From 6bcf682a563aab2720e18af633cef27abd89613b Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Wed, 12 Mar 2025 16:14:12 -0700 Subject: [PATCH 01/14] clean up imports --- tests/integration/agents/test_agents.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/integration/agents/test_agents.py b/tests/integration/agents/test_agents.py index a542e5403e..bc0e679bd4 100644 --- a/tests/integration/agents/test_agents.py +++ b/tests/integration/agents/test_agents.py @@ -10,8 +10,7 @@ import pytest from llama_stack_client.lib.agents.agent import Agent from llama_stack_client.lib.agents.event_logger import EventLogger -from llama_stack_client.types.agents.turn_create_params import Document as AgentDocument -from llama_stack_client.types.memory_insert_params import Document +from llama_stack_client.types.agents.turn_create_params import Document from llama_stack_client.types.shared_params.agent_config import AgentConfig, ToolConfig from llama_stack.apis.agents.agents import ( @@ -242,7 +241,7 @@ def test_code_interpreter_for_attachments(llama_stack_client_with_mocked_inferen codex_agent = Agent(llama_stack_client_with_mocked_inference, **agent_config) session_id = codex_agent.create_session(f"test-session-{uuid4()}") - inflation_doc = AgentDocument( + inflation_doc = Document( content="https://raw.githubusercontent.com/meta-llama/llama-stack-apps/main/examples/resources/inflation.csv", mime_type="text/csv", ) From fa480149ddc833c2cc7d6b3bd598d8a194af8335 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Wed, 12 Mar 2025 16:21:55 -0700 Subject: [PATCH 02/14] regen --- tests/integration/datasetio/test_datasetio.py | 1 + .../recorded_responses/chat_completion.json | 17368 +++++++++++++--- .../recorded_responses/invoke_tool.json | 55 +- tests/integration/scoring/test_scoring.py | 6 + 4 files changed, 14451 insertions(+), 2979 deletions(-) diff --git a/tests/integration/datasetio/test_datasetio.py b/tests/integration/datasetio/test_datasetio.py index f112071a62..5b1d1a37a0 100644 --- a/tests/integration/datasetio/test_datasetio.py +++ b/tests/integration/datasetio/test_datasetio.py @@ -99,3 +99,4 @@ def test_get_rows_paginated(llama_stack_client): assert isinstance(response.rows, list) assert len(response.rows) == 2 assert response.next_page_token == "5" + llama_stack_client.datasets.unregister("test_dataset") diff --git a/tests/integration/fixtures/recorded_responses/chat_completion.json b/tests/integration/fixtures/recorded_responses/chat_completion.json index 7234b6c31d..9148386248 100644 --- a/tests/integration/fixtures/recorded_responses/chat_completion.json +++ b/tests/integration/fixtures/recorded_responses/chat_completion.json @@ -26738,7 +26738,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. \", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. \", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": false, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-212\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": false, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-212\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -26766,27 +26766,7 @@ "data": { "event": { "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " provided function definitions", + "text": "[", "type": "text" }, "event_type": { @@ -26806,7 +26786,7 @@ "data": { "event": { "delta": { - "text": " are not suitable", + "text": "get_boiling_point(liquid", "type": "text" }, "event_type": { @@ -26826,7 +26806,7 @@ "data": { "event": { "delta": { - "text": " for this task. Please re", + "text": "_name=\"polyjuice\",", "type": "text" }, "event_type": { @@ -26846,7 +26826,7 @@ "data": { "event": { "delta": { - "text": "work them to", + "text": " celcius=True)]", "type": "text" }, "event_type": { @@ -26866,8 +26846,20 @@ "data": { "event": { "delta": { - "text": " align with the task requirements.", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "celcius": true, + "liquid_name": "polyjuice" + }, + "call_id": "a985b4ae-b665-4931-baea-8dc633a063a4", + "tool_name": "get_boiling_point" + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -26875,7 +26867,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -26908,16 +26904,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "D2n_IS_8", + "span_id": "GzGznnkt", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T02:03:32.021393+00:00", + "__datetime__": "2025-03-12T23:16:44.978488+00:00", "__module__": "datetime" }, - "trace_id": "amAiZv5PQKSsA74j", + "trace_id": "78cDdNNeSnusAfVf", "type": "metric", "unit": "tokens", - "value": 90 + "value": 231 }, { "attributes": { @@ -26925,16 +26921,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "D2n_IS_8", + "span_id": "GzGznnkt", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T02:03:32.021420+00:00", + "__datetime__": "2025-03-12T23:16:44.978521+00:00", "__module__": "datetime" }, - "trace_id": "amAiZv5PQKSsA74j", + "trace_id": "78cDdNNeSnusAfVf", "type": "metric", "unit": "tokens", - "value": 32 + "value": 28 }, { "attributes": { @@ -26942,16 +26938,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "D2n_IS_8", + "span_id": "GzGznnkt", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T02:03:32.021427+00:00", + "__datetime__": "2025-03-12T23:16:44.978527+00:00", "__module__": "datetime" }, - "trace_id": "amAiZv5PQKSsA74j", + "trace_id": "78cDdNNeSnusAfVf", "type": "metric", "unit": "tokens", - "value": 122 + "value": 259 } ] } @@ -26959,7 +26955,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. \", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. \", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": false, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-212\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -27007,7 +27003,7 @@ "data": { "event": { "delta": { - "text": "get_boiling_point(liquid_name=\"polyjuice\", celcius", + "text": "get_boiling_point", "type": "text" }, "event_type": { @@ -27027,7 +27023,67 @@ "data": { "event": { "delta": { - "text": "=True)]", + "text": "(liquid_name=\"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "polyjuice\",", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " celcius=False", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ")]", "type": "text" }, "event_type": { @@ -27054,10 +27110,10 @@ }, "tool_call": { "arguments": { - "celcius": true, + "celcius": false, "liquid_name": "polyjuice" }, - "call_id": "fc83cd58-3cfb-431d-a1e2-a8572d682e2f", + "call_id": "4736e424-7686-434d-8365-e1ecd942772e", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -27105,16 +27161,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "YhFB39Ik", + "span_id": "W_ToMqBJ", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:31.335148+00:00", + "__datetime__": "2025-03-12T23:16:43.429519+00:00", "__module__": "datetime" }, - "trace_id": "3n2xEtjLQt6ZGVR_", + "trace_id": "KJPwXGhmQuK-RWvz", "type": "metric", "unit": "tokens", - "value": 267 + "value": 184 }, { "attributes": { @@ -27122,13 +27178,13 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "YhFB39Ik", + "span_id": "W_ToMqBJ", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:31.335179+00:00", + "__datetime__": "2025-03-12T23:16:43.429546+00:00", "__module__": "datetime" }, - "trace_id": "3n2xEtjLQt6ZGVR_", + "trace_id": "KJPwXGhmQuK-RWvz", "type": "metric", "unit": "tokens", "value": 28 @@ -27139,16 +27195,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "YhFB39Ik", + "span_id": "W_ToMqBJ", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:31.335185+00:00", + "__datetime__": "2025-03-12T23:16:43.429552+00:00", "__module__": "datetime" }, - "trace_id": "3n2xEtjLQt6ZGVR_", + "trace_id": "KJPwXGhmQuK-RWvz", "type": "metric", "unit": "tokens", - "value": 295 + "value": 212 } ] } @@ -27156,7 +27212,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. \", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. \", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": false, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-212\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -27204,7 +27260,67 @@ "data": { "event": { "delta": { - "text": "get_boiling_point(liquid_name=\"polyjuice\", celcius", + "text": "get_boiling", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "_point(liquid_name", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "=\"polyjuice", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "\", celcius", "type": "text" }, "event_type": { @@ -27254,7 +27370,7 @@ "celcius": true, "liquid_name": "polyjuice" }, - "call_id": "7d41a671-f3ce-46dd-b001-443aaa65ccb7", + "call_id": "377375df-0ee4-4a96-b7c7-9b26aa6bf7e4", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -27302,16 +27418,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "lnqeV_cZ", + "span_id": "kv_YUQAA", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:29.708270+00:00", + "__datetime__": "2025-03-12T23:16:41.422850+00:00", "__module__": "datetime" }, - "trace_id": "me4qbUSCQ5yKvrAG", + "trace_id": "ynpACzVfQD6zAEOf", "type": "metric", "unit": "tokens", - "value": 211 + "value": 137 }, { "attributes": { @@ -27319,13 +27435,13 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "lnqeV_cZ", + "span_id": "kv_YUQAA", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:29.708281+00:00", + "__datetime__": "2025-03-12T23:16:41.422897+00:00", "__module__": "datetime" }, - "trace_id": "me4qbUSCQ5yKvrAG", + "trace_id": "ynpACzVfQD6zAEOf", "type": "metric", "unit": "tokens", "value": 28 @@ -27336,16 +27452,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "lnqeV_cZ", + "span_id": "kv_YUQAA", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:29.708284+00:00", + "__datetime__": "2025-03-12T23:16:41.422904+00:00", "__module__": "datetime" }, - "trace_id": "me4qbUSCQ5yKvrAG", + "trace_id": "ynpACzVfQD6zAEOf", "type": "metric", "unit": "tokens", - "value": 239 + "value": 165 } ] } @@ -27353,7 +27469,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. \", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. \", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -27401,7 +27517,7 @@ "data": { "event": { "delta": { - "text": "get_boiling_point(liquid_name=\"polyjuice\", celcius", + "text": "get_boiling", "type": "text" }, "event_type": { @@ -27421,7 +27537,67 @@ "data": { "event": { "delta": { - "text": "=True)]", + "text": "_point(liquid_name", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "=\"polyjuice", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "\", celcius", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "=False)]", "type": "text" }, "event_type": { @@ -27448,10 +27624,10 @@ }, "tool_call": { "arguments": { - "celcius": true, + "celcius": false, "liquid_name": "polyjuice" }, - "call_id": "21c8e60f-d205-4b3d-b065-47fa56dcd273", + "call_id": "8aa692c5-bc2e-4e77-80ca-749b27386818", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -27499,16 +27675,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "TDJHPVDZ", + "span_id": "pNtd6Xcf", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:28.195776+00:00", + "__datetime__": "2025-03-12T23:16:39.488074+00:00", "__module__": "datetime" }, - "trace_id": "r2GKj8iqTYaNxTeq", + "trace_id": "G34m9Yx8QIG_hNx-", "type": "metric", "unit": "tokens", - "value": 155 + "value": 90 }, { "attributes": { @@ -27516,13 +27692,13 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "TDJHPVDZ", + "span_id": "pNtd6Xcf", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:28.195808+00:00", + "__datetime__": "2025-03-12T23:16:39.488109+00:00", "__module__": "datetime" }, - "trace_id": "r2GKj8iqTYaNxTeq", + "trace_id": "G34m9Yx8QIG_hNx-", "type": "metric", "unit": "tokens", "value": 28 @@ -27533,16 +27709,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "TDJHPVDZ", + "span_id": "pNtd6Xcf", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:28.195814+00:00", + "__datetime__": "2025-03-12T23:16:39.488115+00:00", "__module__": "datetime" }, - "trace_id": "r2GKj8iqTYaNxTeq", + "trace_id": "G34m9Yx8QIG_hNx-", "type": "metric", "unit": "tokens", - "value": 183 + "value": 118 } ] } @@ -27550,7 +27726,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. \", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. \", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -27648,7 +27824,7 @@ "celcius": true, "liquid_name": "polyjuice" }, - "call_id": "135d468e-6391-401d-a3c0-3b08c3a6eb8c", + "call_id": "fc83cd58-3cfb-431d-a1e2-a8572d682e2f", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -27696,16 +27872,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "8pZtsyNW", + "span_id": "YhFB39Ik", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:47:51.321089+00:00", + "__datetime__": "2025-03-07T01:44:31.335148+00:00", "__module__": "datetime" }, - "trace_id": "1Ly70plQQGel5jgc", + "trace_id": "3n2xEtjLQt6ZGVR_", "type": "metric", "unit": "tokens", - "value": 99 + "value": 267 }, { "attributes": { @@ -27713,13 +27889,13 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "8pZtsyNW", + "span_id": "YhFB39Ik", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:47:51.321130+00:00", + "__datetime__": "2025-03-07T01:44:31.335179+00:00", "__module__": "datetime" }, - "trace_id": "1Ly70plQQGel5jgc", + "trace_id": "3n2xEtjLQt6ZGVR_", "type": "metric", "unit": "tokens", "value": 28 @@ -27730,16 +27906,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "8pZtsyNW", + "span_id": "YhFB39Ik", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:47:51.321140+00:00", + "__datetime__": "2025-03-07T01:44:31.335185+00:00", "__module__": "datetime" }, - "trace_id": "1Ly70plQQGel5jgc", + "trace_id": "3n2xEtjLQt6ZGVR_", "type": "metric", "unit": "tokens", - "value": 127 + "value": 295 } ] } @@ -27747,7 +27923,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. \", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. \", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -27795,7 +27971,7 @@ "data": { "event": { "delta": { - "text": "get_boiling_point(liquid_name='polyjuice", + "text": "get_boiling_point(liquid_name=\"polyjuice\", celcius", "type": "text" }, "event_type": { @@ -27815,7 +27991,7 @@ "data": { "event": { "delta": { - "text": "', celcius=True)]", + "text": "=True)]", "type": "text" }, "event_type": { @@ -27845,7 +28021,7 @@ "celcius": true, "liquid_name": "polyjuice" }, - "call_id": "3955f756-9aa0-433f-be8f-af8941c220de", + "call_id": "7d41a671-f3ce-46dd-b001-443aaa65ccb7", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -27893,16 +28069,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "QZ6PSGpT", + "span_id": "lnqeV_cZ", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T02:03:29.629456+00:00", + "__datetime__": "2025-03-07T01:44:29.708270+00:00", "__module__": "datetime" }, - "trace_id": "M72bosg8TBe3uhx3", + "trace_id": "me4qbUSCQ5yKvrAG", "type": "metric", "unit": "tokens", - "value": 43 + "value": 211 }, { "attributes": { @@ -27910,13 +28086,13 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "QZ6PSGpT", + "span_id": "lnqeV_cZ", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T02:03:29.629488+00:00", + "__datetime__": "2025-03-07T01:44:29.708281+00:00", "__module__": "datetime" }, - "trace_id": "M72bosg8TBe3uhx3", + "trace_id": "me4qbUSCQ5yKvrAG", "type": "metric", "unit": "tokens", "value": 28 @@ -27927,16 +28103,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "QZ6PSGpT", + "span_id": "lnqeV_cZ", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T02:03:29.629494+00:00", + "__datetime__": "2025-03-07T01:44:29.708284+00:00", "__module__": "datetime" }, - "trace_id": "M72bosg8TBe3uhx3", + "trace_id": "me4qbUSCQ5yKvrAG", "type": "metric", "unit": "tokens", - "value": 71 + "value": 239 } ] } @@ -27944,7 +28120,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. \", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -27972,47 +28148,7 @@ "data": { "event": { "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " function call returned an", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " error since", + "text": "[", "type": "text" }, "event_type": { @@ -28032,7 +28168,7 @@ "data": { "event": { "delta": { - "text": " \"", + "text": "get_boiling_point(liquid_name=\"polyjuice\", celcius", "type": "text" }, "event_type": { @@ -28052,7 +28188,7 @@ "data": { "event": { "delta": { - "text": "polyjuice\" is", + "text": "=True)]", "type": "text" }, "event_type": { @@ -28072,8 +28208,20 @@ "data": { "event": { "delta": { - "text": " not a real liquid. Polyju", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "celcius": true, + "liquid_name": "polyjuice" + }, + "call_id": "21c8e60f-d205-4b3d-b065-47fa56dcd273", + "tool_name": "get_boiling_point" + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -28081,7 +28229,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -28092,53 +28244,94 @@ "data": { "event": { "delta": { - "text": "ice is a fictional substance from the", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " Harry Potter series. The boiling point", - "type": "text" + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "TDJHPVDZ", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:28.195776+00:00", + "__module__": "datetime" + }, + "trace_id": "r2GKj8iqTYaNxTeq", + "type": "metric", + "unit": "tokens", + "value": 155 }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "TDJHPVDZ", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:28.195808+00:00", + "__module__": "datetime" + }, + "trace_id": "r2GKj8iqTYaNxTeq", + "type": "metric", + "unit": "tokens", + "value": 28 }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "TDJHPVDZ", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:28.195814+00:00", + "__module__": "datetime" + }, + "trace_id": "r2GKj8iqTYaNxTeq", + "type": "metric", + "unit": "tokens", + "value": 183 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. \", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " of a substance is a physical", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -28152,7 +28345,7 @@ "data": { "event": { "delta": { - "text": " property that can be measured and", + "text": "[", "type": "text" }, "event_type": { @@ -28172,7 +28365,7 @@ "data": { "event": { "delta": { - "text": " quantified", + "text": "get_boiling_point(liquid_name=\"polyjuice\", celcius", "type": "text" }, "event_type": { @@ -28192,7 +28385,7 @@ "data": { "event": { "delta": { - "text": ", but it only applies", + "text": "=True)]", "type": "text" }, "event_type": { @@ -28212,8 +28405,20 @@ "data": { "event": { "delta": { - "text": " to real substances that exist in the physical world.", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "celcius": true, + "liquid_name": "polyjuice" + }, + "call_id": "135d468e-6391-401d-a3c0-3b08c3a6eb8c", + "tool_name": "get_boiling_point" + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -28221,7 +28426,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -28254,16 +28463,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "y9SHtJTQ", + "span_id": "8pZtsyNW", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T02:05:01.411612+00:00", + "__datetime__": "2025-03-07T01:47:51.321089+00:00", "__module__": "datetime" }, - "trace_id": "_I2Cu85IRtOSBSX9", + "trace_id": "1Ly70plQQGel5jgc", "type": "metric", "unit": "tokens", - "value": 84 + "value": 99 }, { "attributes": { @@ -28271,16 +28480,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "y9SHtJTQ", + "span_id": "8pZtsyNW", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T02:05:01.411644+00:00", + "__datetime__": "2025-03-07T01:47:51.321130+00:00", "__module__": "datetime" }, - "trace_id": "_I2Cu85IRtOSBSX9", + "trace_id": "1Ly70plQQGel5jgc", "type": "metric", "unit": "tokens", - "value": 73 + "value": 28 }, { "attributes": { @@ -28288,16 +28497,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "y9SHtJTQ", + "span_id": "8pZtsyNW", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T02:05:01.411650+00:00", + "__datetime__": "2025-03-07T01:47:51.321140+00:00", "__module__": "datetime" }, - "trace_id": "_I2Cu85IRtOSBSX9", + "trace_id": "1Ly70plQQGel5jgc", "type": "metric", "unit": "tokens", - "value": 157 + "value": 127 } ] } @@ -28305,7 +28514,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. \", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -28333,7 +28542,7 @@ "data": { "event": { "delta": { - "text": "The", + "text": "[", "type": "text" }, "event_type": { @@ -28353,7 +28562,7 @@ "data": { "event": { "delta": { - "text": " function get_boiling_point is not", + "text": "get_boiling_point", "type": "text" }, "event_type": { @@ -28373,7 +28582,7 @@ "data": { "event": { "delta": { - "text": " recognized.", + "text": "(liquid_name='polyju", "type": "text" }, "event_type": { @@ -28393,17 +28602,93 @@ "data": { "event": { "delta": { - "text": "", + "text": "ice', celcius=True", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ")]", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "celcius": true, + "liquid_name": "polyjuice" + }, + "call_id": "b30bc9bd-2ba2-4016-a319-a5321c217282", + "tool_name": "get_boiling_point" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", "__module__": "llama_stack.models.llama.datatypes", "value": "end_of_turn" } @@ -28415,16 +28700,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "Z7jBGJ-8", + "span_id": "XJHIgX_A", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:55.401637+00:00", + "__datetime__": "2025-03-12T23:16:37.978838+00:00", "__module__": "datetime" }, - "trace_id": "WxMAq579Q-ixJ3wJ", + "trace_id": "FvxBc5KZSX2OJ-XT", "type": "metric", "unit": "tokens", - "value": 93 + "value": 43 }, { "attributes": { @@ -28432,16 +28717,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "Z7jBGJ-8", + "span_id": "XJHIgX_A", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:55.401666+00:00", + "__datetime__": "2025-03-12T23:16:37.978902+00:00", "__module__": "datetime" }, - "trace_id": "WxMAq579Q-ixJ3wJ", + "trace_id": "FvxBc5KZSX2OJ-XT", "type": "metric", "unit": "tokens", - "value": 20 + "value": 28 }, { "attributes": { @@ -28449,16 +28734,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "Z7jBGJ-8", + "span_id": "XJHIgX_A", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:55.401670+00:00", + "__datetime__": "2025-03-12T23:16:37.978910+00:00", "__module__": "datetime" }, - "trace_id": "WxMAq579Q-ixJ3wJ", + "trace_id": "FvxBc5KZSX2OJ-XT", "type": "metric", "unit": "tokens", - "value": 113 + "value": 71 } ] } @@ -28466,7 +28751,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point_with_metadata\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point_with_metadata\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -28514,67 +28799,7 @@ "data": { "event": { "delta": { - "text": " function get_bo", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "iling_point_with_metadata does not exist,", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " I will", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " assume you", + "text": " function call returned an", "type": "text" }, "event_type": { @@ -28594,7 +28819,7 @@ "data": { "event": { "delta": { - "text": " meant get_bo", + "text": " error", "type": "text" }, "event_type": { @@ -28614,7 +28839,7 @@ "data": { "event": { "delta": { - "text": "iling_point_with_metadata", + "text": ", poly", "type": "text" }, "event_type": { @@ -28634,7 +28859,7 @@ "data": { "event": { "delta": { - "text": ". The boiling point of polyjuice", + "text": "juice is not", "type": "text" }, "event_type": { @@ -28654,7 +28879,7 @@ "data": { "event": { "delta": { - "text": " is -100.", + "text": " a real liquid.", "type": "text" }, "event_type": { @@ -28696,16 +28921,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "8dM6i5mO", + "span_id": "i15F3AnP", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T02:05:03.329281+00:00", + "__datetime__": "2025-03-12T23:18:16.174015+00:00", "__module__": "datetime" }, - "trace_id": "zMJDP5dXRrChi7uE", + "trace_id": "0IGoDzUNTxC53bAN", "type": "metric", "unit": "tokens", - "value": 86 + "value": 84 }, { "attributes": { @@ -28713,16 +28938,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "8dM6i5mO", + "span_id": "i15F3AnP", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T02:05:03.329312+00:00", + "__datetime__": "2025-03-12T23:18:16.174351+00:00", "__module__": "datetime" }, - "trace_id": "zMJDP5dXRrChi7uE", + "trace_id": "0IGoDzUNTxC53bAN", "type": "metric", "unit": "tokens", - "value": 45 + "value": 26 }, { "attributes": { @@ -28730,16 +28955,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "8dM6i5mO", + "span_id": "i15F3AnP", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T02:05:03.329318+00:00", + "__datetime__": "2025-03-12T23:18:16.174361+00:00", "__module__": "datetime" }, - "trace_id": "zMJDP5dXRrChi7uE", + "trace_id": "0IGoDzUNTxC53bAN", "type": "metric", "unit": "tokens", - "value": 131 + "value": 110 } ] } @@ -28747,7 +28972,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point_with_metadata\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point_with_metadata` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point_with_metadata\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -28795,27 +29020,7 @@ "data": { "event": { "delta": { - "text": " function get_boiling_point_with_metadata(", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "liquid_name=\"polyjuice\", celcius=True) should be", + "text": " function get_boiling_point is not", "type": "text" }, "event_type": { @@ -28835,7 +29040,7 @@ "data": { "event": { "delta": { - "text": " used to get the answer.", + "text": " recognized.", "type": "text" }, "event_type": { @@ -28877,16 +29082,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "pzQMKAJc", + "span_id": "Z7jBGJ-8", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:56.809816+00:00", + "__datetime__": "2025-03-07T01:45:55.401637+00:00", "__module__": "datetime" }, - "trace_id": "018KkGcOThSSiZfE", + "trace_id": "WxMAq579Q-ixJ3wJ", "type": "metric", "unit": "tokens", - "value": 97 + "value": 93 }, { "attributes": { @@ -28894,16 +29099,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "pzQMKAJc", + "span_id": "Z7jBGJ-8", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:56.809911+00:00", + "__datetime__": "2025-03-07T01:45:55.401666+00:00", "__module__": "datetime" }, - "trace_id": "018KkGcOThSSiZfE", + "trace_id": "WxMAq579Q-ixJ3wJ", "type": "metric", "unit": "tokens", - "value": 39 + "value": 20 }, { "attributes": { @@ -28911,16 +29116,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "pzQMKAJc", + "span_id": "Z7jBGJ-8", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:56.809922+00:00", + "__datetime__": "2025-03-07T01:45:55.401670+00:00", "__module__": "datetime" }, - "trace_id": "018KkGcOThSSiZfE", + "trace_id": "WxMAq579Q-ixJ3wJ", "type": "metric", "unit": "tokens", - "value": 136 + "value": 113 } ] } @@ -28928,7 +29133,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point_with_metadata\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point_with_metadata\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -28956,7 +29161,7 @@ "data": { "event": { "delta": { - "text": "[", + "text": "The", "type": "text" }, "event_type": { @@ -28976,7 +29181,7 @@ "data": { "event": { "delta": { - "text": "get_boiling_point(liquid_name='polyjuice", + "text": " function call should have", "type": "text" }, "event_type": { @@ -28996,7 +29201,7 @@ "data": { "event": { "delta": { - "text": "', celcius=True)]", + "text": " been get", "type": "text" }, "event_type": { @@ -29016,20 +29221,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "celcius": true, - "liquid_name": "polyjuice" - }, - "call_id": "328cb19d-47bb-47cc-8258-a5ca2e26803e", - "tool_name": "get_boiling_point" - }, - "type": "tool_call" + "text": "_boiling_point_with", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -29037,11 +29230,7 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } @@ -29052,97 +29241,16 @@ "data": { "event": { "delta": { - "text": "", + "text": "_metadata, I", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "dS0bhfN_", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:04:53.324788+00:00", - "__module__": "datetime" - }, - "trace_id": "UJz5Cas1SDyQYeBk", - "type": "metric", - "unit": "tokens", - "value": 37 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "dS0bhfN_", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:04:53.324835+00:00", - "__module__": "datetime" - }, - "trace_id": "UJz5Cas1SDyQYeBk", - "type": "metric", - "unit": "tokens", - "value": 28 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "dS0bhfN_", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:04:53.324844+00:00", - "__module__": "datetime" - }, - "trace_id": "UJz5Cas1SDyQYeBk", - "type": "metric", - "unit": "tokens", - "value": 65 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" - }, - "logprobs": null, - "stop_reason": null + "stop_reason": null }, "metrics": null } @@ -29153,7 +29261,7 @@ "data": { "event": { "delta": { - "text": "[", + "text": " will", "type": "text" }, "event_type": { @@ -29173,7 +29281,7 @@ "data": { "event": { "delta": { - "text": "get_boiling_point_with_metadata", + "text": " make sure to use", "type": "text" }, "event_type": { @@ -29193,7 +29301,7 @@ "data": { "event": { "delta": { - "text": "(liquid_name='polyjuice', cel", + "text": " the correct function name in", "type": "text" }, "event_type": { @@ -29213,7 +29321,7 @@ "data": { "event": { "delta": { - "text": "cius=True)]", + "text": " the future.", "type": "text" }, "event_type": { @@ -29227,42 +29335,6 @@ "metrics": null } }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "celcius": true, - "liquid_name": "polyjuice" - }, - "call_id": "5bb48d00-7d5c-49e2-bddf-e5fdc5f35485", - "tool_name": "get_boiling_point_with_metadata" - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": null - } - }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", @@ -29291,16 +29363,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "mfrFN7m2", + "span_id": "2UaLOS7T", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T02:05:02.136501+00:00", + "__datetime__": "2025-03-12T23:18:19.016688+00:00", "__module__": "datetime" }, - "trace_id": "T4eddr4-SMWPQwKA", + "trace_id": "tm3A32woQsmtUmLd", "type": "metric", "unit": "tokens", - "value": 37 + "value": 86 }, { "attributes": { @@ -29308,16 +29380,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "mfrFN7m2", + "span_id": "2UaLOS7T", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T02:05:02.136529+00:00", + "__datetime__": "2025-03-12T23:18:19.016723+00:00", "__module__": "datetime" }, - "trace_id": "T4eddr4-SMWPQwKA", + "trace_id": "tm3A32woQsmtUmLd", "type": "metric", "unit": "tokens", - "value": 30 + "value": 37 }, { "attributes": { @@ -29325,16 +29397,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "mfrFN7m2", + "span_id": "2UaLOS7T", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T02:05:02.136535+00:00", + "__datetime__": "2025-03-12T23:18:19.016730+00:00", "__module__": "datetime" }, - "trace_id": "T4eddr4-SMWPQwKA", + "trace_id": "tm3A32woQsmtUmLd", "type": "metric", "unit": "tokens", - "value": 67 + "value": 123 } ] } @@ -29342,7 +29414,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Give me a sentence that contains the word: hello\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": []}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point_with_metadata\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point_with_metadata` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point_with_metadata\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -29370,27 +29442,7 @@ "data": { "event": { "delta": { - "text": "When", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " I answered the", + "text": "The", "type": "text" }, "event_type": { @@ -29410,7 +29462,7 @@ "data": { "event": { "delta": { - "text": " phone, the friendly", + "text": " function get_boiling_point_with_metadata(", "type": "text" }, "event_type": { @@ -29430,7 +29482,7 @@ "data": { "event": { "delta": { - "text": " voice on the other end said \"hello\"", + "text": "liquid_name=\"polyjuice\", celcius=True) should be", "type": "text" }, "event_type": { @@ -29450,7 +29502,7 @@ "data": { "event": { "delta": { - "text": " and asked how I was doing.", + "text": " used to get the answer.", "type": "text" }, "event_type": { @@ -29492,16 +29544,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "tJEuRhla", + "span_id": "pzQMKAJc", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:01.044284+00:00", + "__datetime__": "2025-03-07T01:45:56.809816+00:00", "__module__": "datetime" }, - "trace_id": "bnDS7Z41TRO0UyfH", + "trace_id": "018KkGcOThSSiZfE", "type": "metric", "unit": "tokens", - "value": 30 + "value": 97 }, { "attributes": { @@ -29509,16 +29561,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "tJEuRhla", + "span_id": "pzQMKAJc", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:01.044312+00:00", + "__datetime__": "2025-03-07T01:45:56.809911+00:00", "__module__": "datetime" }, - "trace_id": "bnDS7Z41TRO0UyfH", + "trace_id": "018KkGcOThSSiZfE", "type": "metric", "unit": "tokens", - "value": 34 + "value": 39 }, { "attributes": { @@ -29526,16 +29578,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "tJEuRhla", + "span_id": "pzQMKAJc", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:01.044318+00:00", + "__datetime__": "2025-03-07T01:45:56.809922+00:00", "__module__": "datetime" }, - "trace_id": "bnDS7Z41TRO0UyfH", + "trace_id": "018KkGcOThSSiZfE", "type": "metric", "unit": "tokens", - "value": 64 + "value": 136 } ] } @@ -29543,7 +29595,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\\n# Sample of data\\nprint(\\\"Data sample from file:\\\")\\nprint(df.head())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\\n# Sample of data\\nprint(\\\"Data sample from file:\\\")\\nprint(df.head())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -29571,7 +29623,7 @@ "data": { "event": { "delta": { - "text": "I", + "text": "[", "type": "text" }, "event_type": { @@ -29591,7 +29643,7 @@ "data": { "event": { "delta": { - "text": " am not able", + "text": "get_boiling_point", "type": "text" }, "event_type": { @@ -29611,7 +29663,7 @@ "data": { "event": { "delta": { - "text": " to execute this task as", + "text": "(liquid_name='polyju", "type": "text" }, "event_type": { @@ -29631,7 +29683,7 @@ "data": { "event": { "delta": { - "text": " it exceeds the", + "text": "ice', celcius=True", "type": "text" }, "event_type": { @@ -29651,7 +29703,7 @@ "data": { "event": { "delta": { - "text": " limitations of the functions I", + "text": ")]", "type": "text" }, "event_type": { @@ -29671,8 +29723,20 @@ "data": { "event": { "delta": { - "text": " have been given.", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "celcius": true, + "liquid_name": "polyjuice" + }, + "call_id": "31855436-909b-43d0-9247-05d1f329d2e9", + "tool_name": "get_boiling_point" + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -29680,7 +29744,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -29713,16 +29781,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "5If5go-q", + "span_id": "X6bzcHl6", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:48.070675+00:00", + "__datetime__": "2025-03-12T23:18:14.474786+00:00", "__module__": "datetime" }, - "trace_id": "StUjhrTMQKKQSRvS", + "trace_id": "-KkgkeBSQBq56Y7A", "type": "metric", "unit": "tokens", - "value": 433 + "value": 37 }, { "attributes": { @@ -29730,16 +29798,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "5If5go-q", + "span_id": "X6bzcHl6", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:48.070742+00:00", + "__datetime__": "2025-03-12T23:18:14.474813+00:00", "__module__": "datetime" }, - "trace_id": "StUjhrTMQKKQSRvS", + "trace_id": "-KkgkeBSQBq56Y7A", "type": "metric", "unit": "tokens", - "value": 31 + "value": 28 }, { "attributes": { @@ -29747,16 +29815,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "5If5go-q", + "span_id": "X6bzcHl6", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:48.070750+00:00", + "__datetime__": "2025-03-12T23:18:14.474820+00:00", "__module__": "datetime" }, - "trace_id": "StUjhrTMQKKQSRvS", + "trace_id": "-KkgkeBSQBq56Y7A", "type": "metric", "unit": "tokens", - "value": 464 + "value": 65 } ] } @@ -29764,7 +29832,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\\n# Sample of data\\nprint(\\\"Data sample from file:\\\")\\nprint(df.head())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -29792,13 +29860,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" + "text": "[", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -29817,13 +29880,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "import pandas as pd\n# Load data\ndf =", - "type": "tool_call" + "text": "get_boiling_point", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -29842,13 +29900,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " pd.read_csv(\"/var/folders/rb/qv8vwgyj", - "type": "tool_call" + "text": "_with_metadata(liquid_name='", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -29867,13 +29920,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "6yjd3t4pwsy9t0rm0000", - "type": "tool_call" + "text": "polyjuice', cel", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -29892,13 +29940,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "gn/T/tmp2x_sml66/ZEjbinQHin", - "type": "tool_call" + "text": "cius=True)]", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -29920,9 +29963,16 @@ "parse_status": { "__enum__": "ToolCallParseStatus", "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "celcius": true, + "liquid_name": "polyjuice" + }, + "call_id": "85d67373-4070-43d2-a484-6d4a97c85e22", + "tool_name": "get_boiling_point_with_metadata" }, - "tool_call": "flation.csv\")\n# Rows\nprint(\"Number of rows and columns in the", "type": "tool_call" }, "event_type": { @@ -29931,7 +29981,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -29942,38 +29996,109 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " data:\", df.shape)\n# Columns\nprint(\"Columns of the data are:\",", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "bbYbxoH7", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-12T23:18:17.392364+00:00", + "__module__": "datetime" + }, + "trace_id": "8q7Ao94XQM-Wh7uH", + "type": "metric", + "unit": "tokens", + "value": 37 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "bbYbxoH7", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-12T23:18:17.392459+00:00", + "__module__": "datetime" + }, + "trace_id": "8q7Ao94XQM-Wh7uH", + "type": "metric", + "unit": "tokens", + "value": 30 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "bbYbxoH7", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-12T23:18:17.392474+00:00", + "__module__": "datetime" + }, + "trace_id": "8q7Ao94XQM-Wh7uH", + "type": "metric", + "unit": "tokens", + "value": 67 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Give me a sentence that contains the word: hello\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": []}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " len(df.columns))\n# Column names\nprint(\"Columns of the data", - "type": "tool_call" + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "When", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -29992,13 +30117,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " are:\", df.columns)\n# Column dtypes\nprint(\"Datatype of the columns are:\", df", - "type": "tool_call" + "text": " I answered", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -30017,13 +30137,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": ".dtypes)\n# Sample of data\nprint(\"Data sample from file:\")\n", - "type": "tool_call" + "text": " the phone,", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -30042,13 +30157,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "print(df.head())", - "type": "tool_call" + "text": " the friendly voice", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -30067,23 +30177,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "code": "import pandas as pd\n# Load data\ndf = pd.read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4pwsy9t0rm0000gn/T/tmp2x_sml66/ZEjbinQHinflation.csv\")\n# Rows\nprint(\"Number of rows and columns in the data:\", df.shape)\n# Columns\nprint(\"Columns of the data are:\", len(df.columns))\n# Column names\nprint(\"Columns of the data are:\", df.columns)\n# Column dtypes\nprint(\"Datatype of the columns are:\", df.dtypes)\n# Sample of data\nprint(\"Data sample from file:\")\nprint(df.head())" - }, - "call_id": "1df8b196-9eff-4b06-97e7-ab175c741e8f", - "tool_name": { - "__enum__": "BuiltinTool", - "__module__": "llama_stack.models.llama.datatypes", - "value": "code_interpreter" - } - }, - "type": "tool_call" + "text": " on the other end", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -30091,11 +30186,47 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " said \"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "hello\" and asked how I was doing.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null }, "metrics": null } @@ -30128,16 +30259,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "fLqIbpek", + "span_id": "oVSdVF2W", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:40.262304+00:00", + "__datetime__": "2025-03-12T23:15:58.571786+00:00", "__module__": "datetime" }, - "trace_id": "StUjhrTMQKKQSRvS", + "trace_id": "tPpkfz4pQZ2NBT2q", "type": "metric", "unit": "tokens", - "value": 235 + "value": 30 }, { "attributes": { @@ -30145,16 +30276,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "fLqIbpek", + "span_id": "oVSdVF2W", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:40.262340+00:00", + "__datetime__": "2025-03-12T23:15:58.571835+00:00", "__module__": "datetime" }, - "trace_id": "StUjhrTMQKKQSRvS", + "trace_id": "tPpkfz4pQZ2NBT2q", "type": "metric", "unit": "tokens", - "value": 10 + "value": 34 }, { "attributes": { @@ -30162,16 +30293,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "fLqIbpek", + "span_id": "oVSdVF2W", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:40.262347+00:00", + "__datetime__": "2025-03-12T23:15:58.571842+00:00", "__module__": "datetime" }, - "trace_id": "StUjhrTMQKKQSRvS", + "trace_id": "tPpkfz4pQZ2NBT2q", "type": "metric", "unit": "tokens", - "value": 245 + "value": 64 } ] } @@ -30179,7 +30310,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\\n# Sample of data\\nprint(\\\"Data sample from file:\\\")\\nprint(df.head())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\\n# Sample of data\\nprint(\\\"Data sample from file:\\\")\\nprint(df.head())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -30187,13 +30318,11623 @@ "data": { "event": { "delta": { - "text": "", + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " am not able", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " to execute this task as", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " it exceeds the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " limitations of the functions I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " have been given.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "5If5go-q", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:48.070675+00:00", + "__module__": "datetime" + }, + "trace_id": "StUjhrTMQKKQSRvS", + "type": "metric", + "unit": "tokens", + "value": 433 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "5If5go-q", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:48.070742+00:00", + "__module__": "datetime" + }, + "trace_id": "StUjhrTMQKKQSRvS", + "type": "metric", + "unit": "tokens", + "value": 31 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "5If5go-q", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:48.070750+00:00", + "__module__": "datetime" + }, + "trace_id": "StUjhrTMQKKQSRvS", + "type": "metric", + "unit": "tokens", + "value": 464 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\\n# Sample of data\\nprint(\\\"Data sample from file:\\\")\\nprint(df.head())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " error message indicates that", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " 'bwrap'", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " was not found.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " This is likely because", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the file path provided", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " is incorrect or the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " file does not exist", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " in the specified location", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ".\n\nTo resolve this", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " issue, you should", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " ensure that", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the file path is", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " correct and the file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " exists in the specified", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " location. If the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " file is located in", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " a different directory,", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " you should provide the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " correct path to", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the file.\n\n", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "Additionally,", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " you can use the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " `os` module", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " to check if the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " file exists before attempting", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " to read it", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ". Here's an example", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ":\n\n```python", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "\nimport os\nimport", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " pandas as pd\n\nfile_path", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " = \"/var/folders/r", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "b/qv8vwgy", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "j6yjd3t", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "4pwsy9t", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "0rm0000gn/T", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "/tmpjdr", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "nryox/gEWH", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "hdSVin", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "flation.csv\"\n\nif", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " os.path.isfile(file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "_path):\n ", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " df = pd.read", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "_csv(file_path)\n", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " print(\"Number", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " of rows and columns", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " in the data", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ":\", df.shape)\n print", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "(\"Columns of the data are", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ":\", len(df", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ".columns))\n print(\"Columns", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " of the data are:\", df", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ".columns)\n print(\"Dat", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "atype of the columns are:\",", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " df.dtypes)\n print", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "(\"Data sample from file:\")\n", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " print(df.head())\nelse", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ":\n print(\"The file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " does not exist\")\n``", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "`\n\nThis code checks if", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the file exists before attempting", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " to read it. If the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " file does not exist, it", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " prints a message indicating that the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " file does not exist.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "UF2BeSUk", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-12T23:17:58.914120+00:00", + "__module__": "datetime" + }, + "trace_id": "PBS_ZwZnRYGrcPR-", + "type": "metric", + "unit": "tokens", + "value": 234 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "UF2BeSUk", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-12T23:17:58.914184+00:00", + "__module__": "datetime" + }, + "trace_id": "PBS_ZwZnRYGrcPR-", + "type": "metric", + "unit": "tokens", + "value": 302 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "UF2BeSUk", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-12T23:17:58.914191+00:00", + "__module__": "datetime" + }, + "trace_id": "PBS_ZwZnRYGrcPR-", + "type": "metric", + "unit": "tokens", + "value": 536 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\n#", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " Load data\ndf", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " = pd.read_csv(\"/var", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "/folders/rb/qv", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "8vwgyj6y", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "jd3t4pws", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "y9t0rm000", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "0gn/T/tmpjdr", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "nryox/gEWH", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "hdSVinflation.csv\")\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "# Rows\nprint(\"Number", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " of rows and columns in the", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " data:\", df.shape)\n#", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " Columns\nprint", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "(\"Columns of the data are", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ":\", len(df", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".columns))\n#", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " Column names\nprint(\"Columns", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " of the data", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " are:\", df", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".columns)\n# Column dtypes", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\nprint(\"", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "Datatype of", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " the columns are:\",", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " df.dtypes)\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "#", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " Sample of data\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "print(\"Data sample", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " from file", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ":\")\nprint(df.head())", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\n# Load data\ndf = pd.read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4pwsy9t0rm0000gn/T/tmpjdrnryox/gEWHhdSVinflation.csv\")\n# Rows\nprint(\"Number of rows and columns in the data:\", df.shape)\n# Columns\nprint(\"Columns of the data are:\", len(df.columns))\n# Column names\nprint(\"Columns of the data are:\", df.columns)\n# Column dtypes\nprint(\"Datatype of the columns are:\", df.dtypes)\n# Sample of data\nprint(\"Data sample from file:\")\nprint(df.head())" + }, + "call_id": "14ba83c1-aa21-4866-8762-2deb55a67a45", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "DUI8IKmF", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-12T23:17:51.740204+00:00", + "__module__": "datetime" + }, + "trace_id": "PBS_ZwZnRYGrcPR-", + "type": "metric", + "unit": "tokens", + "value": 37 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "DUI8IKmF", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-12T23:17:51.740245+00:00", + "__module__": "datetime" + }, + "trace_id": "PBS_ZwZnRYGrcPR-", + "type": "metric", + "unit": "tokens", + "value": 10 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "DUI8IKmF", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-12T23:17:51.740252+00:00", + "__module__": "datetime" + }, + "trace_id": "PBS_ZwZnRYGrcPR-", + "type": "metric", + "unit": "tokens", + "value": 47 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics about the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"The error message indicates that the file \\\"\" does not exist. This could be due to a variety of reasons such as the file being deleted, the path being incorrect, or the file not being accessible.\\n\\nTo resolve this issue, you can try the following:\\n\\n1. Check the file path: Ensure that the file path is correct and the file exists at that location.\\n2. Check file permissions: Ensure that the file is accessible and you have the necessary permissions to read it.\\n3. Try a different file: If the file is not accessible, try loading a different file to see if the issue is specific to this file or a general issue with your code.\\n4. Check for typos: Ensure that there are no typos in the file path or the code.\\n\\nIf you are still having issues, please provide more details about the file and the code you are using, and I'll be happy to help further.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Convert the 'Year' column to datetime\\ndf['Year'] = pd.to_datetime(df['Year'], format='%Y')\\n\\n# Group by 'Year' and calculate the average inflation\\ndf_avg_inflation = df.groupby('Year')['Inflation'].mean().reset_index()\\n\\n# Plot the average yearly inflation as a time series\\nplt.figure(figsize=(10,6))\\nplt.plot(df_avg_inflation['Year'], df_avg_inflation['Inflation'], marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Inflation')\\nplt.grid(True)\\nplt.show()\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " error message indicates that the file \"/var/folders/rb/qv8", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "vwgyj6yjd3t4pwsy9t0", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "rm0000gn/T/tmp2x_sml66/9vY", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "vmVRoinflation.csv\" does not exist. This could be due to", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " a variety of reasons such as the file being deleted, the path being incorrect", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ", or the file not being accessible.\n\nTo resolve this issue, you can", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " try the following:\n\n1. Check the file path: Ensure that the file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " path is correct and the file exists at that location.\n2. Check file permissions:", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " Ensure that the file is accessible and you have the necessary permissions to", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " read it.\n3. Try a different file: If the file is not", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " accessible, try loading a different file to see if the issue is specific to", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " this file or a general issue with your code.\n4. Check for ty", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "pos: Ensure that there are no typos in the file path or the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " code.\n\nIf you are still having issues, please provide more details about the file and the code", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " you are using, and I'll be happy to help further.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "f28sT2i7", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:23.262530+00:00", + "__module__": "datetime" + }, + "trace_id": "8YKzpfybSiGgrHOF", + "type": "metric", + "unit": "tokens", + "value": 680 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "f28sT2i7", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:23.262555+00:00", + "__module__": "datetime" + }, + "trace_id": "8YKzpfybSiGgrHOF", + "type": "metric", + "unit": "tokens", + "value": 238 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "f28sT2i7", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:23.262558+00:00", + "__module__": "datetime" + }, + "trace_id": "8YKzpfybSiGgrHOF", + "type": "metric", + "unit": "tokens", + "value": 918 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics about the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"The error message indicates that the file \\\"\" does not exist. This could be due to a variety of reasons such as the file being deleted, the path being incorrect, or the file not being accessible.\\n\\nTo resolve this issue, you can try the following:\\n\\n1. Check the file path: Ensure that the file path is correct and the file exists at that location.\\n2. Check file permissions: Ensure that the file is accessible and you have the necessary permissions to read it.\\n3. Try a different file: If the file is not accessible, try loading a different file to see if the issue is specific to this file or a general issue with your code.\\n4. Check for typos: Ensure that there are no typos in the file path or the code.\\n\\nIf you are still having issues, please provide more details about the file and the code you are using, and I'll be happy to help further.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load the CSV", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " file\ndf = pd.read_csv(\"/var/folders/rb/qv", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "8vwgyj6yjd3t4pwsy9t", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "0rm0000gn/T/tmp2x_sml66/9v", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "YvmVRoinflation.csv\")\n\n# Convert the 'Year'", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " column to datetime\ndf['Year'] = pd.to_datetime(df['Year", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "'], format='%Y')\n\n# Group by", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " 'Year' and calculate the average inflation\ndf_avg_in", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "flation = df.groupby('Year')['Inflation'].mean().reset_index()\n\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "# Plot the average yearly inflation as a time series\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "plt.figure(figsize=(10,6))\nplt.plot(df_avg_inflation['", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "Year'], df_avg_in", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "flation['Inflation'], marker='o')\nplt", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "('Inflation')\nplt.grid(True)\nplt.show()", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4pwsy9t0rm0000gn/T/tmp2x_sml66/9vYvmVRoinflation.csv\")\n\n# Convert the 'Year' column to datetime\ndf['Year'] = pd.to_datetime(df['Year'], format='%Y')\n\n# Group by 'Year' and calculate the average inflation\ndf_avg_inflation = df.groupby('Year')['Inflation'].mean().reset_index()\n\n# Plot the average yearly inflation as a time series\nplt.figure(figsize=(10,6))\nplt.plot(df_avg_inflation['Year'], df_avg_inflation['Inflation'], marker='o')\nplt.title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel('Inflation')\nplt.grid(True)\nplt.show()" + }, + "call_id": "f4efa2d4-e4e7-4ea1-8c5e-6a78bec5816f", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "qQY5sAli", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:21.953806+00:00", + "__module__": "datetime" + }, + "trace_id": "8YKzpfybSiGgrHOF", + "type": "metric", + "unit": "tokens", + "value": 432 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "qQY5sAli", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:21.953843+00:00", + "__module__": "datetime" + }, + "trace_id": "8YKzpfybSiGgrHOF", + "type": "metric", + "unit": "tokens", + "value": 10 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "qQY5sAli", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:21.953847+00:00", + "__module__": "datetime" + }, + "trace_id": "8YKzpfybSiGgrHOF", + "type": "metric", + "unit": "tokens", + "value": 442 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics of the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"The error message indicates that the file \\\"\" does not exist. This could be due to a number of reasons such as the file being deleted, the path being incorrect, or the file not being accessible.\\n\\nTo resolve this issue, you should ensure that the file exists and the path is correct. If the file does not exist, you will need to create it or obtain it from the relevant source. If the path is incorrect, you will need to update the path to the correct location of the file.\\n\\nAdditionally, the error message mentions \\\"bwrap\\\" which is a command-line tool for running programs in a sandboxed environment. It's possible that the issue is related to the environment in which the code is being run, rather than the code itself.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Convert the 'Year' column to datetime\\ndf['Year'] = pd.to_datetime(df['Year'], format='%Y')\\n\\n# Group by 'Year' and calculate the average inflation\\ndf_avg_inflation = df.groupby('Year')['Inflation'].mean().reset_index()\\n\\n# Plot the average yearly inflation as a time series\\nplt.figure(figsize=(10,6))\\nplt.plot(df_avg_inflation['Year'], df_avg_inflation['Inflation'], marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Inflation')\\nplt.grid(True)\\nplt.show()\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " error message indicates that the file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " \"/var/folders/rb", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "/qv8vwgyj", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "6yjd3t4", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "pwsy9t0", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "rm0000gn/T/tmp", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "jdrnryox/", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "6q7CwY", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "m0inflation.csv\"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " does not exist. This could", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " be due to a number of", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " reasons such as the file being", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " deleted, the path being incorrect", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ", or the file not being", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " accessible.\n\nTo resolve this issue", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ", you should ensure that the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " file exists and the path is", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " correct. If the file does", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " not exist, you will need", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " to create it or obtain it", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " from the relevant source. If", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the path is", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " incorrect, you will need to", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " update the path to the correct", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " location of the file.\n\nAdditionally", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ", the error message mentions \"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "bwrap\" which is a", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " command-line tool", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " for running programs in a sandbox", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "ed environment. It's possible", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " that the issue is related to", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the environment in which the code", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " is being run, rather than", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the code itself.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "gOIUTtiI", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-12T23:16:31.326082+00:00", + "__module__": "datetime" + }, + "trace_id": "yeHsGm3mQxqHTxdk", + "type": "metric", + "unit": "tokens", + "value": 655 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "gOIUTtiI", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-12T23:16:31.326098+00:00", + "__module__": "datetime" + }, + "trace_id": "yeHsGm3mQxqHTxdk", + "type": "metric", + "unit": "tokens", + "value": 207 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "gOIUTtiI", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-12T23:16:31.326100+00:00", + "__module__": "datetime" + }, + "trace_id": "yeHsGm3mQxqHTxdk", + "type": "metric", + "unit": "tokens", + "value": 862 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics of the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"The error message indicates that the file \\\"\" does not exist. This could be due to a number of reasons such as the file being deleted, the path being incorrect, or the file not being accessible.\\n\\nTo resolve this issue, you should ensure that the file exists and the path is correct. If the file does not exist, you will need to create it or obtain it from the relevant source. If the path is incorrect, you will need to update the path to the correct location of the file.\\n\\nAdditionally, the error message mentions \\\"bwrap\\\" which is a command-line tool for running programs in a sandboxed environment. It's possible that the issue is related to the environment in which the code is being run, rather than the code itself.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\nimport", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " matplotlib.pyplot as", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " plt\n\n# Load", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " the CSV file\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "df = pd.read", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "_csv(\"/var/f", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "olders/rb/q", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "v8vwgy", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "j6yjd", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "3t4p", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "wsy9t", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "0rm0000", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "gn/T/tmpj", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "drnryox", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "/6q7", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "CwYm0", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "inflation.csv\")\n\n# Convert", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " the 'Year' column", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " to datetime\ndf", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "['Year'] =", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " pd.to_datetime(df", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "['Year'], format", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "='%Y')\n\n#", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " Group by", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " 'Year'", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " and calculate the average inflation\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "df_avg_inflation = df", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".groupby('Year')['Inflation", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "'].mean().reset_index()\n\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "# Plot the average yearly inflation", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " as a time", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " series\nplt", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".figure(figsize=(10,6", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "))\nplt.plot(df_avg_in", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "flation['Year'], df_avg", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "_inflation['Inflation'],", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " marker='o')\nplt.title", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "('Average Year", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "ly Inflation", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "')\nplt.xlabel('Year')\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "plt.ylabel('Inflation')\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "plt.grid(True)\nplt.show", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "()", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4pwsy9t0rm0000gn/T/tmpjdrnryox/6q7CwYm0inflation.csv\")\n\n# Convert the 'Year' column to datetime\ndf['Year'] = pd.to_datetime(df['Year'], format='%Y')\n\n# Group by 'Year' and calculate the average inflation\ndf_avg_inflation = df.groupby('Year')['Inflation'].mean().reset_index()\n\n# Plot the average yearly inflation as a time series\nplt.figure(figsize=(10,6))\nplt.plot(df_avg_inflation['Year'], df_avg_inflation['Inflation'], marker='o')\nplt.title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel('Inflation')\nplt.grid(True)\nplt.show()" + }, + "call_id": "f39928f4-df63-46b1-9ab7-269f5d80df83", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "SPVD5n3Z", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-12T23:16:28.011354+00:00", + "__module__": "datetime" + }, + "trace_id": "yeHsGm3mQxqHTxdk", + "type": "metric", + "unit": "tokens", + "value": 404 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "SPVD5n3Z", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-12T23:16:28.011382+00:00", + "__module__": "datetime" + }, + "trace_id": "yeHsGm3mQxqHTxdk", + "type": "metric", + "unit": "tokens", + "value": 10 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "SPVD5n3Z", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-12T23:16:28.011389+00:00", + "__module__": "datetime" + }, + "trace_id": "yeHsGm3mQxqHTxdk", + "type": "metric", + "unit": "tokens", + "value": 414 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics about the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " error message indicates that the file \"/var/folders/rb/qv8vwgyj6y", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "jd3t4pwsy9t0rm0000gn/T/tmp2x_sml", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "66/9vYvmVRoinflation.csv\" does not exist. This could be", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " due to a variety of", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " reasons such as the file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " being deleted, the path being incorrect, or the file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " not being accessible.\n\nTo resolve this issue, you can try", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the following:\n\n1. Check the file path: Ensure that", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the file path is correct and the file exists at that", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " location.\n2. Check file permissions: Ensure that", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the file is accessible and", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " you have the necessary permissions to read", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " it.\n3. Try a different file: If", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the file is not accessible, try loading a different file to see", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " if the issue is specific to this file or a general", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " issue with your code.\n", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "4. Check for typos: Ensure that", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " there are no typos in the file path or the code.\n\n", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "If you are", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " still having issues, please provide more details about", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the file and the code you are using", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ", and I'll be happy to help further.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "KwfNrQLy", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:19.630894+00:00", + "__module__": "datetime" + }, + "trace_id": "kNsljyzfQV2Cn4aZ", + "type": "metric", + "unit": "tokens", + "value": 192 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "KwfNrQLy", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:19.630987+00:00", + "__module__": "datetime" + }, + "trace_id": "kNsljyzfQV2Cn4aZ", + "type": "metric", + "unit": "tokens", + "value": 238 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "KwfNrQLy", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:19.630996+00:00", + "__module__": "datetime" + }, + "trace_id": "kNsljyzfQV2Cn4aZ", + "type": "metric", + "unit": "tokens", + "value": 430 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics of the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " error message indicates that the file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " \"/var/folders/rb", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "/qv8vwgyj", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "6yjd3t4", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "pwsy9t0", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "rm0000gn/T/tmp", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "jdrnryox/", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "6q7CwY", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "m0in", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "flation.csv\"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " does not exist. This could", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " be due to a", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " number of", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " reasons such as", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the file being deleted, the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " path being", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " incorrect, or the file not", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " being accessible.\n\nTo", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " resolve this", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " issue, you should ensure", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " that the file exists", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " and the path is correct.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " If the file does not", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " exist, you will need", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " to create it or obtain it", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " from the relevant", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " source. If", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the path is", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " incorrect, you", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " will need to update the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " path to the correct", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " location of the file.\n\nAdditionally", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ", the error", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " message mentions \"bwrap\"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " which is a command", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "-line", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " tool", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " for running programs", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " in a sandboxed", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " environment. It's", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " possible that the issue is", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " related to the environment", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " in", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " which the code is", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " being run, rather", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " than", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the code itself.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "lgPGJmgn", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-12T23:16:22.739081+00:00", + "__module__": "datetime" + }, + "trace_id": "H5eQcwg3S5yEsFZA", + "type": "metric", + "unit": "tokens", + "value": 195 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "lgPGJmgn", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-12T23:16:22.739118+00:00", + "__module__": "datetime" + }, + "trace_id": "H5eQcwg3S5yEsFZA", + "type": "metric", + "unit": "tokens", + "value": 207 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "lgPGJmgn", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-12T23:16:22.739122+00:00", + "__module__": "datetime" + }, + "trace_id": "H5eQcwg3S5yEsFZA", + "type": "metric", + "unit": "tokens", + "value": 402 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\n\n#", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " Load the CSV file\ndf", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " = pd.read_csv(\"/var", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "/folders/rb/qv", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "8vwgyj6y", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "jd3t", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "4pwsy9t", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "0rm0000gn/T", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "/tmpjdrnryox", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "/6q7Cw", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "Ym0inflation.csv", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\")\n\n# Print the first few", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " rows of the dataframe\nprint", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "(df.head())\n\n#", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " Print information about the dataframe\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "print(df.info())\n\n# Print", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " summary statistics of the", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " dataframe\nprint(df.describe())", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4pwsy9t0rm0000gn/T/tmpjdrnryox/6q7CwYm0inflation.csv\")\n\n# Print the first few rows of the dataframe\nprint(df.head())\n\n# Print information about the dataframe\nprint(df.info())\n\n# Print summary statistics of the dataframe\nprint(df.describe())" + }, + "call_id": "104a058f-5fa5-4861-a2f4-28e09bf1dfbc", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "qgUmXXsV", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-12T23:16:14.519707+00:00", + "__module__": "datetime" + }, + "trace_id": "H5eQcwg3S5yEsFZA", + "type": "metric", + "unit": "tokens", + "value": 36 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "qgUmXXsV", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-12T23:16:14.519781+00:00", + "__module__": "datetime" + }, + "trace_id": "H5eQcwg3S5yEsFZA", + "type": "metric", + "unit": "tokens", + "value": 10 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "qgUmXXsV", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-12T23:16:14.519787+00:00", + "__module__": "datetime" + }, + "trace_id": "H5eQcwg3S5yEsFZA", + "type": "metric", + "unit": "tokens", + "value": 46 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:42933\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:20e5d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0cd43\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"using LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:20e5d\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:20e5d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:20e5d\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:42933\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:20e5d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0cd43\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "[k", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "nowledge_search(query=\"using LoRA in Torchtune", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "\")]", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "using LoRA in Torchtune" + }, + "call_id": "ce4b06be-6e7f-45cf-9555-25398caaf4f1", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "vGtNmXNY", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:32.673350+00:00", + "__module__": "datetime" + }, + "trace_id": "8C2YTmRESTKZ0i1l", + "type": "metric", + "unit": "tokens", + "value": 107 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "vGtNmXNY", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:32.673375+00:00", + "__module__": "datetime" + }, + "trace_id": "8C2YTmRESTKZ0i1l", + "type": "metric", + "unit": "tokens", + "value": 23 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "vGtNmXNY", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:32.673381+00:00", + "__module__": "datetime" + }, + "trace_id": "8C2YTmRESTKZ0i1l", + "type": "metric", + "unit": "tokens", + "value": 130 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:42933\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:20e5d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0cd43\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "'m ready to help. What's", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " your question about Torchtune?", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "7n3WMt3R", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:31.179269+00:00", + "__module__": "datetime" + }, + "trace_id": "BLgI_VzNTCCRs_2T", + "type": "metric", + "unit": "tokens", + "value": 75 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "7n3WMt3R", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:31.179301+00:00", + "__module__": "datetime" + }, + "trace_id": "BLgI_VzNTCCRs_2T", + "type": "metric", + "unit": "tokens", + "value": 25 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "7n3WMt3R", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:31.179308+00:00", + "__module__": "datetime" + }, + "trace_id": "BLgI_VzNTCCRs_2T", + "type": "metric", + "unit": "tokens", + "value": 100 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:7f524\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:4bcdb\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:c553d\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your first question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"using LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:4bcdb\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:4bcdb\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:4bcdb\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:7f524\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:4bcdb\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:c553d\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your first question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "[k", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "nowledge_search(query=\"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "using LoRA in Tor", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "chtune\")]", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "using LoRA in Torchtune" + }, + "call_id": "bbfbe149-e78a-4ec1-9cb9-37f47b482d31", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "6F-9YFWm", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-12T23:17:40.613104+00:00", + "__module__": "datetime" + }, + "trace_id": "gDuozGbVSrmg-3Tl", + "type": "metric", + "unit": "tokens", + "value": 108 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "6F-9YFWm", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-12T23:17:40.613124+00:00", + "__module__": "datetime" + }, + "trace_id": "gDuozGbVSrmg-3Tl", + "type": "metric", + "unit": "tokens", + "value": 23 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "6F-9YFWm", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-12T23:17:40.613129+00:00", + "__module__": "datetime" + }, + "trace_id": "gDuozGbVSrmg-3Tl", + "type": "metric", + "unit": "tokens", + "value": 131 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:7f524\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:4bcdb\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:c553d\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "'m ready", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " to help.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " What's your first", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " question about Torcht", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "une?", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "dVzcpbWR", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-12T23:17:38.650749+00:00", + "__module__": "datetime" + }, + "trace_id": "sElmvWPvRneQHEaY", + "type": "metric", + "unit": "tokens", + "value": 75 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "dVzcpbWR", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-12T23:17:38.650820+00:00", + "__module__": "datetime" + }, + "trace_id": "sElmvWPvRneQHEaY", + "type": "metric", + "unit": "tokens", + "value": 26 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "dVzcpbWR", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-12T23:17:38.650834+00:00", + "__module__": "datetime" + }, + "trace_id": "sElmvWPvRneQHEaY", + "type": "metric", + "unit": "tokens", + "value": 101 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:8106c\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:a03f3\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0719d\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your first question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"using LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:a03f3\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:a03f3\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:a03f3\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:8106c\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:a03f3\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0719d\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your first question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "[k", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "nowledge_search(query=\"using LoRA in Torchtune", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "\")]", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "using LoRA in Torchtune" + }, + "call_id": "d45a488f-368a-4a3b-a2d9-8fde584fc8f8", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "qLPBZlok", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:26.209198+00:00", + "__module__": "datetime" + }, + "trace_id": "7GQeegpgTI-gqjHp", + "type": "metric", + "unit": "tokens", + "value": 108 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "qLPBZlok", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:26.209239+00:00", + "__module__": "datetime" + }, + "trace_id": "7GQeegpgTI-gqjHp", + "type": "metric", + "unit": "tokens", + "value": 23 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "qLPBZlok", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:26.209247+00:00", + "__module__": "datetime" + }, + "trace_id": "7GQeegpgTI-gqjHp", + "type": "metric", + "unit": "tokens", + "value": 131 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:8106c\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:a03f3\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0719d\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "'m ready to help. What's", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " your first question about Torchtune", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "?", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "mYTkxvK_", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:23.525734+00:00", + "__module__": "datetime" + }, + "trace_id": "kpcdkZQ2SsSOh9Lw", + "type": "metric", + "unit": "tokens", + "value": 75 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "mYTkxvK_", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:23.525763+00:00", + "__module__": "datetime" + }, + "trace_id": "kpcdkZQ2SsSOh9Lw", + "type": "metric", + "unit": "tokens", + "value": 26 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "mYTkxvK_", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:23.525770+00:00", + "__module__": "datetime" + }, + "trace_id": "kpcdkZQ2SsSOh9Lw", + "type": "metric", + "unit": "tokens", + "value": 101 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:9cb06\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:ed09a\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:48279\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"using LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:ed09a\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:ed09a\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:ed09a\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics about the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"The error message indicates that the file \\\"\" does not exist. This could be due to a variety of reasons such as the file being deleted, the path being incorrect, or the file not being accessible.\\n\\nTo resolve this issue, you can try the following:\\n\\n1. Check the file path: Ensure that the file path is correct and the file exists at that location.\\n2. Check file permissions: Ensure that the file is accessible and you have the necessary permissions to read it.\\n3. Try a different file: If the file is not accessible, try loading a different file to see if the issue is specific to this file or a general issue with your code.\\n4. Check for typos: Ensure that there are no typos in the file path or the code.\\n\\nIf you are still having issues, please provide more details about the file and the code you are using, and I'll be happy to help further.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Convert the 'Year' column to datetime\\ndf['Year'] = pd.to_datetime(df['Year'], format='%Y')\\n\\n# Group by 'Year' and calculate the average inflation\\ndf_avg_inflation = df.groupby('Year')['Inflation'].mean().reset_index()\\n\\n# Plot the average yearly inflation as a time series\\nplt.figure(figsize=(10,6))\\nplt.plot(df_avg_inflation['Year'], df_avg_inflation['Inflation'], marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Inflation')\\nplt.grid(True)\\nplt.show()\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", + "text": "_state_dict(base_model.state_dict", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -30572,7 +42208,7 @@ "data": { "event": { "delta": { - "text": "The", + "text": "(), strict=False", "type": "text" }, "event_type": { @@ -30592,7 +42228,7 @@ "data": { "event": { "delta": { - "text": " error message indicates that the file \"/var/folders/rb/qv8", + "text": ")`\n4. Set only Lo", "type": "text" }, "event_type": { @@ -30612,7 +42248,7 @@ "data": { "event": { "delta": { - "text": "vwgyj6yjd3t4pwsy9t0", + "text": "RA parameters to trainable: `", "type": "text" }, "event_type": { @@ -30632,7 +42268,7 @@ "data": { "event": { "delta": { - "text": "rm0000gn/T/tmp2x_sml66/9vY", + "text": "from torch", "type": "text" }, "event_type": { @@ -30652,7 +42288,7 @@ "data": { "event": { "delta": { - "text": "vmVRoinflation.csv\" does not exist. This could be due to", + "text": "tune.modules", "type": "text" }, "event_type": { @@ -30672,7 +42308,7 @@ "data": { "event": { "delta": { - "text": " a variety of reasons such as the file being deleted, the path being incorrect", + "text": ".peft.peft_utils import", "type": "text" }, "event_type": { @@ -30692,7 +42328,7 @@ "data": { "event": { "delta": { - "text": ", or the file not being accessible.\n\nTo resolve this issue, you can", + "text": " get_adapter_params, set_train", "type": "text" }, "event_type": { @@ -30712,7 +42348,7 @@ "data": { "event": { "delta": { - "text": " try the following:\n\n1. Check the file path: Ensure that the file", + "text": "able_params`\n5. Run", "type": "text" }, "event_type": { @@ -30732,7 +42368,7 @@ "data": { "event": { "delta": { - "text": " path is correct and the file exists at that location.\n2. Check file permissions:", + "text": " the LoRA finetune", "type": "text" }, "event_type": { @@ -30752,7 +42388,7 @@ "data": { "event": { "delta": { - "text": " Ensure that the file is accessible and you have the necessary permissions to", + "text": " using torchtune's", "type": "text" }, "event_type": { @@ -30772,7 +42408,7 @@ "data": { "event": { "delta": { - "text": " read it.\n3. Try a different file: If the file is not", + "text": " LoRA", "type": "text" }, "event_type": { @@ -30792,7 +42428,7 @@ "data": { "event": { "delta": { - "text": " accessible, try loading a different file to see if the issue is specific to", + "text": " recipe: `tune run", "type": "text" }, "event_type": { @@ -30812,7 +42448,7 @@ "data": { "event": { "delta": { - "text": " this file or a general issue with your code.\n4. Check for ty", + "text": " --", "type": "text" }, "event_type": { @@ -30832,7 +42468,7 @@ "data": { "event": { "delta": { - "text": "pos: Ensure that there are no typos in the file path or the", + "text": "nnodes 1 --", "type": "text" }, "event_type": { @@ -30852,7 +42488,7 @@ "data": { "event": { "delta": { - "text": " code.\n\nIf you are still having issues, please provide more details about the file and the code", + "text": "nproc_per_node 2", "type": "text" }, "event_type": { @@ -30872,7 +42508,7 @@ "data": { "event": { "delta": { - "text": " you are using, and I'll be happy to help further.", + "text": " lora_finetune_d", "type": "text" }, "event_type": { @@ -30892,94 +42528,73 @@ "data": { "event": { "delta": { - "text": "", + "text": "istributed --config llama2/", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "f28sT2i7", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:23.262530+00:00", - "__module__": "datetime" - }, - "trace_id": "8YKzpfybSiGgrHOF", - "type": "metric", - "unit": "tokens", - "value": 680 + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "7B_lora`\n\nYou", + "type": "text" }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "f28sT2i7", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:23.262555+00:00", - "__module__": "datetime" - }, - "trace_id": "8YKzpfybSiGgrHOF", - "type": "metric", - "unit": "tokens", - "value": 238 + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " can also experiment with different Lo", + "type": "text" }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "f28sT2i7", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:23.262558+00:00", - "__module__": "datetime" - }, - "trace_id": "8YKzpfybSiGgrHOF", - "type": "metric", - "unit": "tokens", - "value": 918 - } - ] + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics about the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"The error message indicates that the file \\\"\" does not exist. This could be due to a variety of reasons such as the file being deleted, the path being incorrect, or the file not being accessible.\\n\\nTo resolve this issue, you can try the following:\\n\\n1. Check the file path: Ensure that the file path is correct and the file exists at that location.\\n2. Check file permissions: Ensure that the file is accessible and you have the necessary permissions to read it.\\n3. Try a different file: If the file is not accessible, try loading a different file to see if the issue is specific to this file or a general issue with your code.\\n4. Check for typos: Ensure that there are no typos in the file path or the code.\\n\\nIf you are still having issues, please provide more details about the file and the code you are using, and I'll be happy to help further.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", + "text": "RA configurations, such as applying", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -30993,13 +42608,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" + "text": " LoRA to all", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -31018,13 +42628,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load the CSV", - "type": "tool_call" + "text": " linear layers in the self-", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -31043,13 +42648,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " file\ndf = pd.read_csv(\"/var/folders/rb/qv", - "type": "tool_call" + "text": "attention, increasing the rank,", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -31068,13 +42668,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "8vwgyj6yjd3t4pwsy9t", - "type": "tool_call" + "text": " and scaling", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -31093,13 +42688,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "0rm0000gn/T/tmp2x_sml66/9v", - "type": "tool_call" + "text": " alpha and rank together.\n\nNote", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -31118,13 +42708,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "YvmVRoinflation.csv\")\n\n# Convert the 'Year'", - "type": "tool_call" + "text": ": You need to have the", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -31143,13 +42728,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " column to datetime\ndf['Year'] = pd.to_datetime(df['Year", - "type": "tool_call" + "text": " Llama2 weights and", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -31168,13 +42748,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "'], format='%Y')\n\n# Group by", - "type": "tool_call" + "text": " tokenizer downloaded and", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -31193,13 +42768,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " 'Year' and calculate the average inflation\ndf_avg_in", - "type": "tool_call" + "text": " installed before running", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -31218,13 +42788,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "flation = df.groupby('Year')['Inflation'].mean().reset_index()\n\n", - "type": "tool_call" + "text": " the LoRA finetune", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -31243,13 +42808,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "# Plot the average yearly inflation as a time series\n", - "type": "tool_call" + "text": ". Additionally,", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -31268,13 +42828,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "plt.figure(figsize=(10,6))\nplt.plot(df_avg_inflation['", - "type": "tool_call" + "text": " you can use", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -31293,13 +42848,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "Year'], df_avg_in", - "type": "tool_call" + "text": " torcht", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -31318,13 +42868,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "flation['Inflation'], marker='o')\nplt", - "type": "tool_call" + "text": "une's `Wand", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -31343,13 +42888,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": ".title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel", - "type": "tool_call" + "text": "BLogger` to generate loss", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -31368,13 +42908,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "('Inflation')\nplt.grid(True)\nplt.show()", - "type": "tool_call" + "text": " curves and track your", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -31393,23 +42928,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4pwsy9t0rm0000gn/T/tmp2x_sml66/9vYvmVRoinflation.csv\")\n\n# Convert the 'Year' column to datetime\ndf['Year'] = pd.to_datetime(df['Year'], format='%Y')\n\n# Group by 'Year' and calculate the average inflation\ndf_avg_inflation = df.groupby('Year')['Inflation'].mean().reset_index()\n\n# Plot the average yearly inflation as a time series\nplt.figure(figsize=(10,6))\nplt.plot(df_avg_inflation['Year'], df_avg_inflation['Inflation'], marker='o')\nplt.title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel('Inflation')\nplt.grid(True)\nplt.show()" - }, - "call_id": "f4efa2d4-e4e7-4ea1-8c5e-6a78bec5816f", - "tool_name": { - "__enum__": "BuiltinTool", - "__module__": "llama_stack.models.llama.datatypes", - "value": "code_interpreter" - } - }, - "type": "tool_call" + "text": " experiments.", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -31417,11 +42937,7 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } @@ -31454,16 +42970,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "qQY5sAli", + "span_id": "ZNWpl4Yu", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:21.953806+00:00", + "__datetime__": "2025-03-12T23:17:34.923841+00:00", "__module__": "datetime" }, - "trace_id": "8YKzpfybSiGgrHOF", + "trace_id": "Uom62a7_SM2JU4Mp", "type": "metric", "unit": "tokens", - "value": 432 + "value": 146 }, { "attributes": { @@ -31471,16 +42987,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "qQY5sAli", + "span_id": "ZNWpl4Yu", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:21.953843+00:00", + "__datetime__": "2025-03-12T23:17:34.923877+00:00", "__module__": "datetime" }, - "trace_id": "8YKzpfybSiGgrHOF", + "trace_id": "Uom62a7_SM2JU4Mp", "type": "metric", "unit": "tokens", - "value": 10 + "value": 294 }, { "attributes": { @@ -31488,16 +43004,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "qQY5sAli", + "span_id": "ZNWpl4Yu", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:21.953847+00:00", + "__datetime__": "2025-03-12T23:17:34.923885+00:00", "__module__": "datetime" }, - "trace_id": "8YKzpfybSiGgrHOF", + "trace_id": "Uom62a7_SM2JU4Mp", "type": "metric", "unit": "tokens", - "value": 442 + "value": 440 } ] } @@ -31505,7 +43021,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics about the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:9cb06\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:ed09a\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:48279\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -31533,27 +43049,7 @@ "data": { "event": { "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " error message indicates that the file \"/var/folders/rb/qv8vwgyj6y", + "text": "[k", "type": "text" }, "event_type": { @@ -31573,7 +43069,7 @@ "data": { "event": { "delta": { - "text": "jd3t4pwsy9t0rm0000gn/T/tmp2x_sml", + "text": "nowledge_search(query", "type": "text" }, "event_type": { @@ -31593,7 +43089,7 @@ "data": { "event": { "delta": { - "text": "66/9vYvmVRoinflation.csv\" does not exist. This could be", + "text": "=\"using Lo", "type": "text" }, "event_type": { @@ -31613,7 +43109,7 @@ "data": { "event": { "delta": { - "text": " due to a variety of", + "text": "RA in Torcht", "type": "text" }, "event_type": { @@ -31633,7 +43129,7 @@ "data": { "event": { "delta": { - "text": " reasons such as the file", + "text": "une\")]", "type": "text" }, "event_type": { @@ -31653,8 +43149,19 @@ "data": { "event": { "delta": { - "text": " being deleted, the path being incorrect, or the file", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "using LoRA in Torchtune" + }, + "call_id": "069baa18-792f-4268-bbd9-65499b6ca253", + "tool_name": "knowledge_search" + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -31662,7 +43169,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -31673,33 +43184,94 @@ "data": { "event": { "delta": { - "text": " not being accessible.\n\nTo resolve this issue, you can try", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "EhlYWsJp", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-12T23:17:28.344753+00:00", + "__module__": "datetime" + }, + "trace_id": "Uom62a7_SM2JU4Mp", + "type": "metric", + "unit": "tokens", + "value": 107 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "EhlYWsJp", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-12T23:17:28.344791+00:00", + "__module__": "datetime" + }, + "trace_id": "Uom62a7_SM2JU4Mp", + "type": "metric", + "unit": "tokens", + "value": 23 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "EhlYWsJp", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-12T23:17:28.344798+00:00", + "__module__": "datetime" + }, + "trace_id": "Uom62a7_SM2JU4Mp", + "type": "metric", + "unit": "tokens", + "value": 130 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:9cb06\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:ed09a\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:48279\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " the following:\n\n1. Check the file path: Ensure that", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -31713,7 +43285,7 @@ "data": { "event": { "delta": { - "text": " the file path is correct and the file exists at that", + "text": "I", "type": "text" }, "event_type": { @@ -31733,7 +43305,7 @@ "data": { "event": { "delta": { - "text": " location.\n2. Check file permissions: Ensure that", + "text": "'m ready", "type": "text" }, "event_type": { @@ -31753,7 +43325,7 @@ "data": { "event": { "delta": { - "text": " the file is accessible and", + "text": " to help.", "type": "text" }, "event_type": { @@ -31773,7 +43345,7 @@ "data": { "event": { "delta": { - "text": " you have the necessary permissions to read", + "text": " What", "type": "text" }, "event_type": { @@ -31793,7 +43365,7 @@ "data": { "event": { "delta": { - "text": " it.\n3. Try a different file: If", + "text": "'s your question about", "type": "text" }, "event_type": { @@ -31813,7 +43385,7 @@ "data": { "event": { "delta": { - "text": " the file is not accessible, try loading a different file to see", + "text": " Torchtune", "type": "text" }, "event_type": { @@ -31833,7 +43405,7 @@ "data": { "event": { "delta": { - "text": " if the issue is specific to this file or a general", + "text": "?", "type": "text" }, "event_type": { @@ -31853,33 +43425,94 @@ "data": { "event": { "delta": { - "text": " issue with your code.\n", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "aWF3H1iZ", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-12T23:17:24.710775+00:00", + "__module__": "datetime" + }, + "trace_id": "9fRx-MuMQbieYC8_", + "type": "metric", + "unit": "tokens", + "value": 75 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "aWF3H1iZ", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-12T23:17:24.710874+00:00", + "__module__": "datetime" + }, + "trace_id": "9fRx-MuMQbieYC8_", + "type": "metric", + "unit": "tokens", + "value": 25 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "aWF3H1iZ", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-12T23:17:24.710888+00:00", + "__module__": "datetime" + }, + "trace_id": "9fRx-MuMQbieYC8_", + "type": "metric", + "unit": "tokens", + "value": 100 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "4. Check for typos: Ensure that", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -31893,7 +43526,7 @@ "data": { "event": { "delta": { - "text": " there are no typos in the file path or the code.\n\n", + "text": "[k", "type": "text" }, "event_type": { @@ -31913,7 +43546,7 @@ "data": { "event": { "delta": { - "text": "If you are", + "text": "nowledge_search(query=\"", "type": "text" }, "event_type": { @@ -31933,7 +43566,7 @@ "data": { "event": { "delta": { - "text": " still having issues, please provide more details about", + "text": "Torchtune documentation", "type": "text" }, "event_type": { @@ -31953,7 +43586,7 @@ "data": { "event": { "delta": { - "text": " the file and the code you are using", + "text": "\")]", "type": "text" }, "event_type": { @@ -31973,8 +43606,19 @@ "data": { "event": { "delta": { - "text": ", and I'll be happy to help further.", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "Torchtune documentation" + }, + "call_id": "7ca12bd0-f629-4f23-9b14-a6f277b28a81", + "tool_name": "knowledge_search" + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -31982,7 +43626,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -32015,16 +43663,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "KwfNrQLy", + "span_id": "o5JuB0Ip", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:19.630894+00:00", + "__datetime__": "2025-03-12T23:17:37.465863+00:00", "__module__": "datetime" }, - "trace_id": "kNsljyzfQV2Cn4aZ", + "trace_id": "sElmvWPvRneQHEaY", "type": "metric", "unit": "tokens", - "value": 192 + "value": 39 }, { "attributes": { @@ -32032,16 +43680,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "KwfNrQLy", + "span_id": "o5JuB0Ip", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:19.630987+00:00", + "__datetime__": "2025-03-12T23:17:37.465891+00:00", "__module__": "datetime" }, - "trace_id": "kNsljyzfQV2Cn4aZ", + "trace_id": "sElmvWPvRneQHEaY", "type": "metric", "unit": "tokens", - "value": 238 + "value": 20 }, { "attributes": { @@ -32049,113 +43697,38 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "KwfNrQLy", + "span_id": "o5JuB0Ip", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:19.630996+00:00", + "__datetime__": "2025-03-12T23:17:37.465897+00:00", "__module__": "datetime" }, - "trace_id": "kNsljyzfQV2Cn4aZ", + "trace_id": "sElmvWPvRneQHEaY", "type": "metric", "unit": "tokens", - "value": 430 + "value": 59 } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "import pandas as pd\n\n# Load the CSV file\ndf = pd.read", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Llama3-8B attention type\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:num-1\\nContent: 3 `_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\\nof models across a `range of different benchmarks `_.\\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. In this tutorial we will focus on the 8B size model.\\nThere are a few main changes between Llama2-7B and Llama3-8B models:\\n\\n- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B\\n- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:num-1\\nContent: instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_\\n\\n|\\n\\nGetting access to Llama3-8B-Instruct\\n------------------------------------\\n\\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. First, let's download the model from Hugging Face. You will need to follow the instructions\\non the `official Meta page `_ to gain access to the model.\\nNext, make sure you grab your Hugging Face token from `here `_.\\n\\n\\n.. code-block:: bash\\n\\n tune download meta-llama/Meta-Llama-3\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:num-0\\nContent: :`download Llama3 Instruct weights `\\n\\n\\nTemplate changes from Llama2 to Llama3\\n--------------------------------------\\n\\nThe Llama2 chat model requires a specific template when prompting the pre-trained\\nmodel. Since the chat model was pretrained with this prompt template, if you want to run\\ninference on the model, you'll need to use the same template for optimal performance\\non chat data. Otherwise, the model will just perform standard text completion, which\\nmay or may not align with your intended use case.\\n\\nFrom the `official Llama2 prompt\\ntemplate guide `_\\nfor the Llama2 chat model, we can see that special tags are added:\\n\\n.. code-block:: text\\n\\n [INST] <>\\n You are a helpful, respectful, and honest assistant.\\n <>\\n\\n Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant \\n\\nLlama3 Instruct `overhauled `\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:num-0\\nContent: 'm Meta AI, your friendly AI assistant<|eot_id|>\\n\\nThe tags are entirely different, and they are actually encoded differently than in\\nLlama2. Let's walk through tokenizing an example with the Llama2 template and the\\nLlama3 template to understand how.\\n\\n.. note::\\n The Llama3 Base model uses a `different prompt template\\n `_ than Llama3 Instruct\\n because it has not yet been instruct tuned and the extra special tokens are untrained. If you\\n are running inference on the Llama3 Base model without fine-tuning we recommend the base\\n template for optimal performance. Generally, for instruct and chat data, we recommend using\\n Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\\n Llama3 Instruct.\\n\\n.. _prompt_template_vs_special_tokens:\\n\\nTokenizing prompt templates & special tokens\\n--------------------------------------------\\n\\nLet's say I have a sample of a single user-assistant turn accompanied with a system\\nprompt:\\n\\n.. code-block:: python\\n\\n sample = [\\n {\\n \\\"role\\\": \\\"system\\\",\\n \\\"\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:num-3\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\", and \\\"output_proj\\\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\\\"q_proj\\\", \\\"v_proj\\\"])\\n\\n.. note::\\n\\n Calling :func:`lora_llama_2\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Insert documents into memory\", \"parameters\": {}, \"tool_name\": \"insert_into_memory\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "_csv(\"/var/folders/rb/qv8vwgyj6y", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -32169,13 +43742,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "jd3t4pwsy9t0rm0000gn/T", - "type": "tool_call" + "text": "L", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -32194,13 +43762,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "/tmp2x_sml66/9vYvmVRoinflation.csv", - "type": "tool_call" + "text": "lama3-8", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -32219,13 +43782,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "\")\n\n# Print the first few rows of the dataframe\nprint(df.head())\n\n", - "type": "tool_call" + "text": "B uses grouped", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -32244,13 +43802,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "# Print information about the dataframe\nprint(df", - "type": "tool_call" + "text": "-query attention instead of", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -32269,13 +43822,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": ".info())\n\n# Print summary statistics about the dataframe\nprint(df.describe", - "type": "tool_call" + "text": " the standard", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -32294,13 +43842,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "())", - "type": "tool_call" + "text": " multi-head attention", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -32319,23 +43862,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "code": "import pandas as pd\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4pwsy9t0rm0000gn/T/tmp2x_sml66/9vYvmVRoinflation.csv\")\n\n# Print the first few rows of the dataframe\nprint(df.head())\n\n# Print information about the dataframe\nprint(df.info())\n\n# Print summary statistics about the dataframe\nprint(df.describe())" - }, - "call_id": "5bbfebeb-4360-4ef9-a9e2-4227a8e8c699", - "tool_name": { - "__enum__": "BuiltinTool", - "__module__": "llama_stack.models.llama.datatypes", - "value": "code_interpreter" - } - }, - "type": "tool_call" + "text": ".", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -32343,11 +43871,7 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } @@ -32380,16 +43904,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "AyEX3So6", + "span_id": "TS8BB6CQ", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:17.873486+00:00", + "__datetime__": "2025-03-12T23:17:20.390405+00:00", "__module__": "datetime" }, - "trace_id": "kNsljyzfQV2Cn4aZ", + "trace_id": "XVVAGxGOTOqhq9V1", "type": "metric", "unit": "tokens", - "value": 36 + "value": 80 }, { "attributes": { @@ -32397,16 +43921,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "AyEX3So6", + "span_id": "TS8BB6CQ", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:17.873500+00:00", + "__datetime__": "2025-03-12T23:17:20.390438+00:00", "__module__": "datetime" }, - "trace_id": "kNsljyzfQV2Cn4aZ", + "trace_id": "XVVAGxGOTOqhq9V1", "type": "metric", "unit": "tokens", - "value": 10 + "value": 28 }, { "attributes": { @@ -32414,16 +43938,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "AyEX3So6", + "span_id": "TS8BB6CQ", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:17.873503+00:00", + "__datetime__": "2025-03-12T23:17:20.390443+00:00", "__module__": "datetime" }, - "trace_id": "kNsljyzfQV2Cn4aZ", + "trace_id": "XVVAGxGOTOqhq9V1", "type": "metric", "unit": "tokens", - "value": 46 + "value": 108 } ] } @@ -32431,7 +43955,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:42933\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:20e5d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0cd43\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"using LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:20e5d\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:20e5d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:20e5d\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:num-1\\nContent: 3 `_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\\nof models across a `range of different benchmarks `_.\\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. In this tutorial we will focus on the 8B size model.\\nThere are a few main changes between Llama2-7B and Llama3-8B models:\\n\\n- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B\\n- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:num-1\\nContent: instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_\\n\\n|\\n\\nGetting access to Llama3-8B-Instruct\\n------------------------------------\\n\\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. First, let's download the model from Hugging Face. You will need to follow the instructions\\non the `official Meta page `_ to gain access to the model.\\nNext, make sure you grab your Hugging Face token from `here `_.\\n\\n\\n.. code-block:: bash\\n\\n tune download meta-llama/Meta-Llama-3\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:num-0\\nContent: :`download Llama3 Instruct weights `\\n\\n\\nTemplate changes from Llama2 to Llama3\\n--------------------------------------\\n\\nThe Llama2 chat model requires a specific template when prompting the pre-trained\\nmodel. Since the chat model was pretrained with this prompt template, if you want to run\\ninference on the model, you'll need to use the same template for optimal performance\\non chat data. Otherwise, the model will just perform standard text completion, which\\nmay or may not align with your intended use case.\\n\\nFrom the `official Llama2 prompt\\ntemplate guide `_\\nfor the Llama2 chat model, we can see that special tags are added:\\n\\n.. code-block:: text\\n\\n [INST] <>\\n You are a helpful, respectful, and honest assistant.\\n <>\\n\\n Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant \\n\\nLlama3 Instruct `overhauled `\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:num-0\\nContent: 'm Meta AI, your friendly AI assistant<|eot_id|>\\n\\nThe tags are entirely different, and they are actually encoded differently than in\\nLlama2. Let's walk through tokenizing an example with the Llama2 template and the\\nLlama3 template to understand how.\\n\\n.. note::\\n The Llama3 Base model uses a `different prompt template\\n `_ than Llama3 Instruct\\n because it has not yet been instruct tuned and the extra special tokens are untrained. If you\\n are running inference on the Llama3 Base model without fine-tuning we recommend the base\\n template for optimal performance. Generally, for instruct and chat data, we recommend using\\n Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\\n Llama3 Instruct.\\n\\n.. _prompt_template_vs_special_tokens:\\n\\nTokenizing prompt templates & special tokens\\n--------------------------------------------\\n\\nLet's say I have a sample of a single user-assistant turn accompanied with a system\\nprompt:\\n\\n.. code-block:: python\\n\\n sample = [\\n {\\n \\\"role\\\": \\\"system\\\",\\n \\\"\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:num-3\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\", and \\\"output_proj\\\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\\\"q_proj\\\", \\\"v_proj\\\"])\\n\\n.. note::\\n\\n Calling :func:`lora_llama_2\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -32459,7 +43983,7 @@ "data": { "event": { "delta": { - "text": "To", + "text": "L", "type": "text" }, "event_type": { @@ -32479,7 +44003,7 @@ "data": { "event": { "delta": { - "text": " use LoRA in Torchtune, you can follow", + "text": "lama3-8", "type": "text" }, "event_type": { @@ -32499,7 +44023,7 @@ "data": { "event": { "delta": { - "text": " these steps:\n\n1. Import the necessary modules: `", + "text": "B uses grouped-query", "type": "text" }, "event_type": { @@ -32519,7 +44043,7 @@ "data": { "event": { "delta": { - "text": "from torchtune.models.llama2 import llama2_7b", + "text": " attention instead of the standard", "type": "text" }, "event_type": { @@ -32539,7 +44063,7 @@ "data": { "event": { "delta": { - "text": ", lora_llama2_7b`\n2. Create a", + "text": " multi-head attention.", "type": "text" }, "event_type": { @@ -32559,13 +44083,94 @@ "data": { "event": { "delta": { - "text": " Llama2 model with LoRA: `lora", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "NLFDWegH", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-12T23:17:16.053013+00:00", + "__module__": "datetime" + }, + "trace_id": "YuGKHtZmRseP3fC4", + "type": "metric", + "unit": "tokens", + "value": 80 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "NLFDWegH", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-12T23:17:16.053042+00:00", + "__module__": "datetime" + }, + "trace_id": "YuGKHtZmRseP3fC4", + "type": "metric", + "unit": "tokens", + "value": 28 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "NLFDWegH", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-12T23:17:16.053045+00:00", + "__module__": "datetime" + }, + "trace_id": "YuGKHtZmRseP3fC4", + "type": "metric", + "unit": "tokens", + "value": 108 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Insert documents into memory\", \"parameters\": {}, \"tool_name\": \"insert_into_memory\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" }, "logprobs": null, "stop_reason": null @@ -32579,7 +44184,7 @@ "data": { "event": { "delta": { - "text": "_model = lora_llama2_7b(lora_attn_modules", + "text": "[k", "type": "text" }, "event_type": { @@ -32599,7 +44204,7 @@ "data": { "event": { "delta": { - "text": "=[\"q_proj\", \"v_proj\"])`\n3. Load the", + "text": "nowledge_search(query=\"", "type": "text" }, "event_type": { @@ -32619,7 +44224,7 @@ "data": { "event": { "delta": { - "text": " pre-trained Llama2 weights into", + "text": "Llama3-8B", "type": "text" }, "event_type": { @@ -32639,7 +44244,7 @@ "data": { "event": { "delta": { - "text": " the LoRA model: `", + "text": " attention type\")]", "type": "text" }, "event_type": { @@ -32659,8 +44264,19 @@ "data": { "event": { "delta": { - "text": "lora_model.load_state_dict(base_model.state_dict(), strict=False)`\n4", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "Llama3-8B attention type" + }, + "call_id": "f855c399-8087-4d70-b315-cbcdfc2e7c64", + "tool_name": "knowledge_search" + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -32668,7 +44284,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -32679,33 +44299,94 @@ "data": { "event": { "delta": { - "text": ". Set only LoRA parameters to trainable: `from torch", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "n9XTUtxe", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-12T23:17:18.759878+00:00", + "__module__": "datetime" + }, + "trace_id": "XVVAGxGOTOqhq9V1", + "type": "metric", + "unit": "tokens", + "value": 40 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "n9XTUtxe", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-12T23:17:18.759959+00:00", + "__module__": "datetime" + }, + "trace_id": "XVVAGxGOTOqhq9V1", + "type": "metric", + "unit": "tokens", + "value": 24 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "n9XTUtxe", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-12T23:17:18.759970+00:00", + "__module__": "datetime" + }, + "trace_id": "XVVAGxGOTOqhq9V1", + "type": "metric", + "unit": "tokens", + "value": 64 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "tune.modules.peft.peft_utils import get_adapter_params, set_train", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -32719,7 +44400,7 @@ "data": { "event": { "delta": { - "text": "able_params`\n5. Run the LoRA finetune using torch", + "text": "[k", "type": "text" }, "event_type": { @@ -32739,7 +44420,7 @@ "data": { "event": { "delta": { - "text": "tune's LoRA recipe: `tune", + "text": "nowledge_search(query=\"", "type": "text" }, "event_type": { @@ -32759,7 +44440,7 @@ "data": { "event": { "delta": { - "text": " run --nnodes 1 --nproc_per_node ", + "text": "Llama3-", "type": "text" }, "event_type": { @@ -32779,7 +44460,7 @@ "data": { "event": { "delta": { - "text": "2 lora_finetune_distributed --config llama2/7B", + "text": "8B attention type\")]", "type": "text" }, "event_type": { @@ -32799,8 +44480,19 @@ "data": { "event": { "delta": { - "text": "_lora`\n\nYou can also experiment with different LoRA configurations, such as", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "Llama3-8B attention type" + }, + "call_id": "a975cf93-4809-4ca9-8a4b-c42e116d58d0", + "tool_name": "knowledge_search" + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -32808,7 +44500,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -32819,53 +44515,94 @@ "data": { "event": { "delta": { - "text": " applying LoRA to all linear layers in the self-attention", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": ", increasing the rank, and scaling alpha", - "type": "text" + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "861X8wcF", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-12T23:17:14.638649+00:00", + "__module__": "datetime" + }, + "trace_id": "YuGKHtZmRseP3fC4", + "type": "metric", + "unit": "tokens", + "value": 40 }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "861X8wcF", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-12T23:17:14.638678+00:00", + "__module__": "datetime" + }, + "trace_id": "YuGKHtZmRseP3fC4", + "type": "metric", + "unit": "tokens", + "value": 24 }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "861X8wcF", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-12T23:17:14.638685+00:00", + "__module__": "datetime" + }, + "trace_id": "YuGKHtZmRseP3fC4", + "type": "metric", + "unit": "tokens", + "value": 64 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the current CEO of Meta is.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"current CEO of Meta\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"{\\\"query\\\": \\\"current CEO of Meta\\\", \\\"top_k\\\": [{\\\"title\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\\\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\\\u00a9 2025 Meta\\\", \\\"score\\\": 0.79099923, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Meet the Executive CSuite Team of Meta (Facebook) [2025]\\\", \\\"url\\\": \\\"https://digitaldefynd.com/IQ/meet-the-executive-csuite-team-of-meta-facebook/\\\", \\\"content\\\": \\\"Harvard University Executive Programs Free Harvard University Courses As a chief financial officer of Meta, Susan Li oversees the firm\\\\u2019s finance and facilities team to keep track of the company\\\\u2019s overall financial health. The chief operating officer of Meta, Javier Olivan, oversees the firm\\\\u2019s business team, infrastructure, and other products. Andrew Bosworth, called Boz, serves as chief technology officer at Meta and is responsible for leading the firm\\\\u2019s AR/VR organization, Reality Labs. Andrew has also served as engineering director to oversee events, mobile monetization, and feed ads and as VP of ads and business platforms to lead engineering, design, analytics, and product teams. Meta\\\\u2019s c-suite team comprises experienced and diverse executives, having extensive experience in technology, finance, legal, and all major industries.\\\", \\\"score\\\": 0.7602419, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Meta - Leadership & Governance\\\", \\\"url\\\": \\\"https://investor.atmeta.com/leadership-and-governance/\\\", \\\"content\\\": \\\"Mr. Andreessen was a co-founder of Netscape Communications Corporation, a software company, serving in various positions, including Chief Technology Officer and Executive Vice President of Products. Ms. Killefer also served as Assistant Secretary for Management, Chief Financial Officer, and Chief Operating Officer of the U.S. Department of the Treasury from 1997 to 2000 and as a member of the IRS Oversight Board from 2000 to 2005, including as Chair of the IRS Oversight Board from 2002 to 2004. Ms. Travis has served as Executive Vice President and Chief Financial Officer of The Estee Lauder Companies Inc., a global manufacturer and marketer of skin care, makeup, fragrance and hair care products, since August 2012.\\\", \\\"score\\\": 0.6175132, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Executives - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/\\\", \\\"content\\\": \\\"Meta leadership: images of senior executives for download to use in articles about the company.\\\", \\\"score\\\": 0.21026355, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg - Wikipedia\\\", \\\"url\\\": \\\"https://en.wikipedia.org/wiki/Mark_Zuckerberg\\\", \\\"content\\\": \\\"They began dating in 2003.[175] In September 2010, Chan, who was a medical student at the University of California, San Francisco at the time,[176] moved into his rented house in Palo Alto, California.[177][178] They married on May 19, 2012, in the grounds of his mansion in an event that also celebrated her graduation from medical school.[179][180] Zuckerberg revealed in July 2015 that they were expecting a baby girl and that Chan had previously experienced three miscarriages.[181] Their first daughter was born in December 2015.[182] They announced in a Chinese New Year video that their daughter's Chinese name is Chen Mingyu (Chinese: \\\\u9648\\\\u660e\\\\u5b87).[183] Their second daughter was born in August 2017.[184] Zuckerberg and his wife welcomed their third daughter in March 2023 and announced the news across his social media pages.[185] The couple also have a Puli dog named Beast,[186] who has over two million followers on Facebook.[187] Zuckerberg commissioned the visual artist Daniel Arsham to build a 7-foot-tall sculpture of his wife, which was unveiled in 2024.[188]\\\", \\\"score\\\": 0.05564338, \\\"raw_content\\\": null}]}\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " and rank together.\n\nNote: You need to have the pre-trained Llama2 weights and tokenizer downloaded", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -32879,7 +44616,7 @@ "data": { "event": { "delta": { - "text": " and installed before running the LoRA finetune. Additionally,", + "text": "The", "type": "text" }, "event_type": { @@ -32899,7 +44636,7 @@ "data": { "event": { "delta": { - "text": " you may need to modify the config file to point to", + "text": " current CEO of Meta is Mark", "type": "text" }, "event_type": { @@ -32919,7 +44656,7 @@ "data": { "event": { "delta": { - "text": " the location of your Llama2 weights and tokenizer.", + "text": " Zuckerberg.", "type": "text" }, "event_type": { @@ -32961,16 +44698,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "4uwx07lA", + "span_id": "GW7-tnGo", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:34.698983+00:00", + "__datetime__": "2025-03-12T23:16:03.252268+00:00", "__module__": "datetime" }, - "trace_id": "8C2YTmRESTKZ0i1l", + "trace_id": "2TwB_v0KTZWN9Q_U", "type": "metric", "unit": "tokens", - "value": 146 + "value": 1079 }, { "attributes": { @@ -32978,16 +44715,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "4uwx07lA", + "span_id": "GW7-tnGo", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:34.699031+00:00", + "__datetime__": "2025-03-12T23:16:03.252339+00:00", "__module__": "datetime" }, - "trace_id": "8C2YTmRESTKZ0i1l", + "trace_id": "2TwB_v0KTZWN9Q_U", "type": "metric", "unit": "tokens", - "value": 296 + "value": 19 }, { "attributes": { @@ -32995,16 +44732,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "4uwx07lA", + "span_id": "GW7-tnGo", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:34.699038+00:00", + "__datetime__": "2025-03-12T23:16:03.252346+00:00", "__module__": "datetime" }, - "trace_id": "8C2YTmRESTKZ0i1l", + "trace_id": "2TwB_v0KTZWN9Q_U", "type": "metric", "unit": "tokens", - "value": 442 + "value": 1098 } ] } @@ -33012,7 +44749,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:42933\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:20e5d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0cd43\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the current CEO of Meta is.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"current CEO of Meta\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"{\\\"query\\\": \\\"current CEO of Meta\\\", \\\"top_k\\\": [{\\\"title\\\": \\\"Meta - Leadership & Governance\\\", \\\"url\\\": \\\"https://investor.atmeta.com/leadership-and-governance/\\\", \\\"content\\\": \\\"Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. Mark is responsible for setting the overall direction and product strategy for the company. He leads the design of Meta's services and development of its core technology and infrastructure. Mark studied computer science at Harvard\\\", \\\"score\\\": 0.8342047, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Executives - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer Joel Kaplan, Chief Global Affairs Officer Susan Li, Chief Financial Officer Javier Olivan, Chief Operating Officer Chris Cox, Chief Product Officer Andrew \\\\u2018Boz\\\\u2019 Bosworth, Chief Technology Officer Jennifer Newstead, Chief Legal Officer Dave Wehner, Chief Strategy Officer Will Cathcart, Head of WhatsApp Naomi Gleit, Head of Product John Hegeman, Chief Revenue Officer Adam Mosseri, Head of Instagram Erin Egan, Chief Privacy Officer, Policy Michel Protti, Chief Privacy Officer, Product Alex Schultz, Chief Marketing Officer and VP of Analytics Tom Alison, Head of Facebook Nicola Mendelsohn, Head of Global Business Group Ahmad Al-Dahle, VP and Head of GenAI at Meta Joelle Pineau, Vice President of AI Research and Head of FAIR at Meta\\\", \\\"score\\\": 0.8190992, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\\\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\\\u00a9 2025 Meta\\\", \\\"score\\\": 0.79099923, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Meet the Executive CSuite Team of Meta (Facebook) [2025]\\\", \\\"url\\\": \\\"https://digitaldefynd.com/IQ/meet-the-executive-csuite-team-of-meta-facebook/\\\", \\\"content\\\": \\\"Harvard University Executive Programs Free Harvard University Courses As a chief financial officer of Meta, Susan Li oversees the firm\\\\u2019s finance and facilities team to keep track of the company\\\\u2019s overall financial health. The chief operating officer of Meta, Javier Olivan, oversees the firm\\\\u2019s business team, infrastructure, and other products. Andrew Bosworth, called Boz, serves as chief technology officer at Meta and is responsible for leading the firm\\\\u2019s AR/VR organization, Reality Labs. Andrew has also served as engineering director to oversee events, mobile monetization, and feed ads and as VP of ads and business platforms to lead engineering, design, analytics, and product teams. Meta\\\\u2019s c-suite team comprises experienced and diverse executives, having extensive experience in technology, finance, legal, and all major industries.\\\", \\\"score\\\": 0.7602419, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg - Wikipedia\\\", \\\"url\\\": \\\"https://en.wikipedia.org/wiki/Mark_Zuckerberg\\\", \\\"content\\\": \\\"They began dating in 2003.[175] In September 2010, Chan, who was a medical student at the University of California, San Francisco at the time,[176] moved into his rented house in Palo Alto, California.[177][178] They married on May 19, 2012, in the grounds of his mansion in an event that also celebrated her graduation from medical school.[179][180] Zuckerberg revealed in July 2015 that they were expecting a baby girl and that Chan had previously experienced three miscarriages.[181] Their first daughter was born in December 2015.[182] They announced in a Chinese New Year video that their daughter's Chinese name is Chen Mingyu (Chinese: \\\\u9648\\\\u660e\\\\u5b87).[183] Their second daughter was born in August 2017.[184] Zuckerberg and his wife welcomed their third daughter in March 2023 and announced the news across his social media pages.[185] The couple also have a Puli dog named Beast,[186] who has over two million followers on Facebook.[187] Zuckerberg commissioned the visual artist Daniel Arsham to build a 7-foot-tall sculpture of his wife, which was unveiled in 2024.[188]\\\", \\\"score\\\": 0.05564338, \\\"raw_content\\\": null}]}\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -33040,27 +44777,7 @@ "data": { "event": { "delta": { - "text": "[k", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "nowledge_search(query=\"using LoRA in Torchtune", + "text": "The", "type": "text" }, "event_type": { @@ -33080,7 +44797,7 @@ "data": { "event": { "delta": { - "text": "\")]", + "text": " current CEO of Meta is Mark Zuckerberg.", "type": "text" }, "event_type": { @@ -33094,41 +44811,6 @@ "metrics": null } }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "using LoRA in Torchtune" - }, - "call_id": "ce4b06be-6e7f-45cf-9555-25398caaf4f1", - "tool_name": "knowledge_search" - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": null - } - }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", @@ -33157,16 +44839,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "vGtNmXNY", + "span_id": "oB7hDf6E", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:32.673350+00:00", + "__datetime__": "2025-03-07T01:44:07.084924+00:00", "__module__": "datetime" }, - "trace_id": "8C2YTmRESTKZ0i1l", + "trace_id": "hwA8OLUhQ1qa3ecF", "type": "metric", "unit": "tokens", - "value": 107 + "value": 1145 }, { "attributes": { @@ -33174,16 +44856,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "vGtNmXNY", + "span_id": "oB7hDf6E", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:32.673375+00:00", + "__datetime__": "2025-03-07T01:44:07.084934+00:00", "__module__": "datetime" }, - "trace_id": "8C2YTmRESTKZ0i1l", + "trace_id": "hwA8OLUhQ1qa3ecF", "type": "metric", "unit": "tokens", - "value": 23 + "value": 19 }, { "attributes": { @@ -33191,16 +44873,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "vGtNmXNY", + "span_id": "oB7hDf6E", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:32.673381+00:00", + "__datetime__": "2025-03-07T01:44:07.084936+00:00", "__module__": "datetime" }, - "trace_id": "8C2YTmRESTKZ0i1l", + "trace_id": "hwA8OLUhQ1qa3ecF", "type": "metric", "unit": "tokens", - "value": 130 + "value": 1164 } ] } @@ -33208,7 +44890,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:42933\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:20e5d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0cd43\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the current CEO of Meta is.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -33236,8 +44918,38 @@ "data": { "event": { "delta": { - "text": "I", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "brave_search.call(query=\"", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -33256,8 +44968,13 @@ "data": { "event": { "delta": { - "text": "'m ready to help. What's", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "current CEO of Meta\")", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -33276,8 +44993,23 @@ "data": { "event": { "delta": { - "text": " your question about Torchtune?", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "current CEO of Meta" + }, + "call_id": "1df09f78-9ead-4366-9f49-359aa0dfb9ef", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "brave_search" + } + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -33285,7 +45017,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -33318,16 +45054,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "7n3WMt3R", + "span_id": "C6HBZWIl", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:31.179269+00:00", + "__datetime__": "2025-03-12T23:16:01.115343+00:00", "__module__": "datetime" }, - "trace_id": "BLgI_VzNTCCRs_2T", + "trace_id": "2TwB_v0KTZWN9Q_U", "type": "metric", "unit": "tokens", - "value": 75 + "value": 34 }, { "attributes": { @@ -33335,16 +45071,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "7n3WMt3R", + "span_id": "C6HBZWIl", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:31.179301+00:00", + "__datetime__": "2025-03-12T23:16:01.115423+00:00", "__module__": "datetime" }, - "trace_id": "BLgI_VzNTCCRs_2T", + "trace_id": "2TwB_v0KTZWN9Q_U", "type": "metric", "unit": "tokens", - "value": 25 + "value": 10 }, { "attributes": { @@ -33352,16 +45088,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "7n3WMt3R", + "span_id": "C6HBZWIl", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:31.179308+00:00", + "__datetime__": "2025-03-12T23:16:01.115438+00:00", "__module__": "datetime" }, - "trace_id": "BLgI_VzNTCCRs_2T", + "trace_id": "2TwB_v0KTZWN9Q_U", "type": "metric", "unit": "tokens", - "value": 100 + "value": 44 } ] } @@ -33369,7 +45105,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:8106c\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:a03f3\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0719d\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your first question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"using LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:a03f3\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:a03f3\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:a03f3\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": \"get_boiling_point\", \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -33397,27 +45133,7 @@ "data": { "event": { "delta": { - "text": "To", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " use LoRA in Torchtune, you can follow", + "text": "The", "type": "text" }, "event_type": { @@ -33437,7 +45153,7 @@ "data": { "event": { "delta": { - "text": " these steps:\n\n1. Import the necessary modules: `", + "text": " boiling point", "type": "text" }, "event_type": { @@ -33457,7 +45173,7 @@ "data": { "event": { "delta": { - "text": "from torchtune.models.llama2 import llama2_7b", + "text": " of polyjuice is -", "type": "text" }, "event_type": { @@ -33477,7 +45193,7 @@ "data": { "event": { "delta": { - "text": ", lora_llama2_7b`\n2. Create a", + "text": "100\u00b0C.", "type": "text" }, "event_type": { @@ -33497,53 +45213,94 @@ "data": { "event": { "delta": { - "text": " Llama2 model with LoRA: `lora", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "_model = lora_llama2_7b(lora_attn_modules", - "type": "text" + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "kHgxCJiI", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-12T23:17:01.920310+00:00", + "__module__": "datetime" + }, + "trace_id": "rkZwHytISlOlMq9O", + "type": "metric", + "unit": "tokens", + "value": 77 }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "kHgxCJiI", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-12T23:17:01.920372+00:00", + "__module__": "datetime" + }, + "trace_id": "rkZwHytISlOlMq9O", + "type": "metric", + "unit": "tokens", + "value": 22 }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "kHgxCJiI", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-12T23:17:01.920380+00:00", + "__module__": "datetime" + }, + "trace_id": "rkZwHytISlOlMq9O", + "type": "metric", + "unit": "tokens", + "value": 99 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "=[\"q_proj\", \"v_proj\"])`\n3. Load", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -33557,7 +45314,7 @@ "data": { "event": { "delta": { - "text": " the pre-trained Llama2 weights into", + "text": "The", "type": "text" }, "event_type": { @@ -33577,7 +45334,7 @@ "data": { "event": { "delta": { - "text": " the LoRA model: `lora_model.load_state", + "text": " boiling point of polyjuice", "type": "text" }, "event_type": { @@ -33597,7 +45354,7 @@ "data": { "event": { "delta": { - "text": "_dict(base_model.state_dict(), strict=False)`\n4. Set only Lo", + "text": " is -100\u00b0C.", "type": "text" }, "event_type": { @@ -33617,53 +45374,94 @@ "data": { "event": { "delta": { - "text": "RA parameters to trainable: `set_trainable_params(lora_model, get", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "_adapter_params(lora_model))`\n5. Run the", - "type": "text" + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "-RYggjae", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-12T23:16:35.674057+00:00", + "__module__": "datetime" + }, + "trace_id": "YT4rgoiUSCS70qhA", + "type": "metric", + "unit": "tokens", + "value": 77 }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "-RYggjae", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-12T23:16:35.674090+00:00", + "__module__": "datetime" + }, + "trace_id": "YT4rgoiUSCS70qhA", + "type": "metric", + "unit": "tokens", + "value": 22 }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "-RYggjae", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-12T23:16:35.674098+00:00", + "__module__": "datetime" + }, + "trace_id": "YT4rgoiUSCS70qhA", + "type": "metric", + "unit": "tokens", + "value": 99 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"required\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " LoRA finetune using torch", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -33677,7 +45475,7 @@ "data": { "event": { "delta": { - "text": "tune's LoRA", + "text": "The", "type": "text" }, "event_type": { @@ -33697,7 +45495,7 @@ "data": { "event": { "delta": { - "text": " recipe: `tune run --nnodes 1 --", + "text": " function get_bo", "type": "text" }, "event_type": { @@ -33717,7 +45515,7 @@ "data": { "event": { "delta": { - "text": "nproc_per_node 2 lora_finetune_distributed --config", + "text": "iling_point is not", "type": "text" }, "event_type": { @@ -33737,7 +45535,7 @@ "data": { "event": { "delta": { - "text": " llama2/7B_lora`\n\nYou can also experiment with different Lo", + "text": " able to", "type": "text" }, "event_type": { @@ -33757,7 +45555,7 @@ "data": { "event": { "delta": { - "text": "RA configurations, such as applying LoRA to all linear layers", + "text": " find the boiling point of", "type": "text" }, "event_type": { @@ -33777,7 +45575,7 @@ "data": { "event": { "delta": { - "text": " in the self-attention, increasing the rank, and", + "text": " \"polyjuice\"", "type": "text" }, "event_type": { @@ -33797,7 +45595,7 @@ "data": { "event": { "delta": { - "text": " scaling alpha and rank together.\n\nNote: You need to", + "text": " as it is not", "type": "text" }, "event_type": { @@ -33817,7 +45615,7 @@ "data": { "event": { "delta": { - "text": " have the Llama2 weights and tokenizer downloaded and installed before running the", + "text": " a real liquid.", "type": "text" }, "event_type": { @@ -33837,7 +45635,7 @@ "data": { "event": { "delta": { - "text": " LoRA finetune. Additionally, you can use", + "text": " Polyjuice is a", "type": "text" }, "event_type": { @@ -33857,7 +45655,7 @@ "data": { "event": { "delta": { - "text": " torchtune's `Wand", + "text": " fictional substance from the", "type": "text" }, "event_type": { @@ -33877,7 +45675,7 @@ "data": { "event": { "delta": { - "text": "BLogger` to generate loss curves and track your experiments", + "text": " Harry Potter series", "type": "text" }, "event_type": { @@ -33939,16 +45737,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "IZ8Q_jX_", + "span_id": "hcq3e4Mt", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:28.484818+00:00", + "__datetime__": "2025-03-12T23:16:48.744933+00:00", "__module__": "datetime" }, - "trace_id": "7GQeegpgTI-gqjHp", + "trace_id": "f1UWdr3yT5CpYBm0", "type": "metric", "unit": "tokens", - "value": 147 + "value": 77 }, { "attributes": { @@ -33956,16 +45754,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "IZ8Q_jX_", + "span_id": "hcq3e4Mt", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:28.484914+00:00", + "__datetime__": "2025-03-12T23:16:48.744962+00:00", "__module__": "datetime" }, - "trace_id": "7GQeegpgTI-gqjHp", + "trace_id": "f1UWdr3yT5CpYBm0", "type": "metric", "unit": "tokens", - "value": 290 + "value": 51 }, { "attributes": { @@ -33973,16 +45771,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "IZ8Q_jX_", + "span_id": "hcq3e4Mt", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:28.484922+00:00", + "__datetime__": "2025-03-12T23:16:48.744969+00:00", "__module__": "datetime" }, - "trace_id": "7GQeegpgTI-gqjHp", + "trace_id": "f1UWdr3yT5CpYBm0", "type": "metric", "unit": "tokens", - "value": 437 + "value": 128 } ] } @@ -33990,7 +45788,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:8106c\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:a03f3\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0719d\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your first question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": \"get_boiling_point\", \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -34018,7 +45816,7 @@ "data": { "event": { "delta": { - "text": "[k", + "text": "The", "type": "text" }, "event_type": { @@ -34038,7 +45836,7 @@ "data": { "event": { "delta": { - "text": "nowledge_search(query=\"using LoRA in Torchtune", + "text": " function call should be", "type": "text" }, "event_type": { @@ -34058,7 +45856,7 @@ "data": { "event": { "delta": { - "text": "\")]", + "text": ":\n[", "type": "text" }, "event_type": { @@ -34078,19 +45876,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "using LoRA in Torchtune" - }, - "call_id": "d45a488f-368a-4a3b-a2d9-8fde584fc8f8", - "tool_name": "knowledge_search" - }, - "type": "tool_call" + "text": "get", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -34098,11 +45885,47 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "_boiling_point(liquid_name='polyjuice', celci", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "us=True)]", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null }, "metrics": null } @@ -34135,16 +45958,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "qLPBZlok", + "span_id": "JN7UZs_c", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:26.209198+00:00", + "__datetime__": "2025-03-07T01:44:42.473221+00:00", "__module__": "datetime" }, - "trace_id": "7GQeegpgTI-gqjHp", + "trace_id": "H3r-_Zh-TVqtSp7k", "type": "metric", "unit": "tokens", - "value": 108 + "value": 86 }, { "attributes": { @@ -34152,16 +45975,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "qLPBZlok", + "span_id": "JN7UZs_c", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:26.209239+00:00", + "__datetime__": "2025-03-07T01:44:42.473254+00:00", "__module__": "datetime" }, - "trace_id": "7GQeegpgTI-gqjHp", + "trace_id": "H3r-_Zh-TVqtSp7k", "type": "metric", "unit": "tokens", - "value": 23 + "value": 34 }, { "attributes": { @@ -34169,16 +45992,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "qLPBZlok", + "span_id": "JN7UZs_c", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:26.209247+00:00", + "__datetime__": "2025-03-07T01:44:42.473261+00:00", "__module__": "datetime" }, - "trace_id": "7GQeegpgTI-gqjHp", + "trace_id": "H3r-_Zh-TVqtSp7k", "type": "metric", "unit": "tokens", - "value": 131 + "value": 120 } ] } @@ -34186,7 +46009,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:8106c\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:a03f3\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0719d\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -34214,7 +46037,7 @@ "data": { "event": { "delta": { - "text": "I", + "text": "The", "type": "text" }, "event_type": { @@ -34234,7 +46057,7 @@ "data": { "event": { "delta": { - "text": "'m ready to help. What's", + "text": " function `get_boiling_point`", "type": "text" }, "event_type": { @@ -34254,7 +46077,7 @@ "data": { "event": { "delta": { - "text": " your first question about Torchtune", + "text": " is not a real function and cannot be", "type": "text" }, "event_type": { @@ -34274,7 +46097,7 @@ "data": { "event": { "delta": { - "text": "?", + "text": " used to determine the boiling point of polyju", "type": "text" }, "event_type": { @@ -34294,94 +46117,13 @@ "data": { "event": { "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "mYTkxvK_", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:23.525734+00:00", - "__module__": "datetime" - }, - "trace_id": "kpcdkZQ2SsSOh9Lw", - "type": "metric", - "unit": "tokens", - "value": 75 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "mYTkxvK_", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:23.525763+00:00", - "__module__": "datetime" - }, - "trace_id": "kpcdkZQ2SsSOh9Lw", - "type": "metric", - "unit": "tokens", - "value": 26 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "mYTkxvK_", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:23.525770+00:00", - "__module__": "datetime" - }, - "trace_id": "kpcdkZQ2SsSOh9Lw", - "type": "metric", - "unit": "tokens", - "value": 101 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", + "text": "ice. Polyjuice is a fictional substance from the", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -34395,7 +46137,7 @@ "data": { "event": { "delta": { - "text": "[k", + "text": " Harry Potter series and does not have a real-world boiling", "type": "text" }, "event_type": { @@ -34415,7 +46157,7 @@ "data": { "event": { "delta": { - "text": "nowledge_search(query=\"Tor", + "text": " point. If you have any other questions or need help", "type": "text" }, "event_type": { @@ -34435,7 +46177,7 @@ "data": { "event": { "delta": { - "text": "chtune documentation\")]", + "text": " with a different topic, feel free to ask!", "type": "text" }, "event_type": { @@ -34449,41 +46191,6 @@ "metrics": null } }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "Torchtune documentation" - }, - "call_id": "385cbde8-19e8-4c8b-84ca-b75050b3666b", - "tool_name": "knowledge_search" - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": null - } - }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", @@ -34512,16 +46219,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "-7YS2sLl", + "span_id": "aCPTIc0d", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:30.668846+00:00", + "__datetime__": "2025-03-07T01:53:27.227208+00:00", "__module__": "datetime" }, - "trace_id": "BLgI_VzNTCCRs_2T", + "trace_id": "4DRyVE86RpCeqfpE", "type": "metric", "unit": "tokens", - "value": 39 + "value": 86 }, { "attributes": { @@ -34529,16 +46236,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "-7YS2sLl", + "span_id": "aCPTIc0d", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:30.668859+00:00", + "__datetime__": "2025-03-07T01:53:27.227251+00:00", "__module__": "datetime" }, - "trace_id": "BLgI_VzNTCCRs_2T", + "trace_id": "4DRyVE86RpCeqfpE", "type": "metric", "unit": "tokens", - "value": 20 + "value": 78 }, { "attributes": { @@ -34546,16 +46253,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "-7YS2sLl", + "span_id": "aCPTIc0d", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:30.668861+00:00", + "__datetime__": "2025-03-07T01:53:27.227258+00:00", "__module__": "datetime" }, - "trace_id": "BLgI_VzNTCCRs_2T", + "trace_id": "4DRyVE86RpCeqfpE", "type": "metric", "unit": "tokens", - "value": 59 + "value": 164 } ] } @@ -34563,7 +46270,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Llama3-8B attention type\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:num-1\\nContent: 3 `_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\\nof models across a `range of different benchmarks `_.\\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. In this tutorial we will focus on the 8B size model.\\nThere are a few main changes between Llama2-7B and Llama3-8B models:\\n\\n- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B\\n- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:num-1\\nContent: instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_\\n\\n|\\n\\nGetting access to Llama3-8B-Instruct\\n------------------------------------\\n\\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. First, let's download the model from Hugging Face. You will need to follow the instructions\\non the `official Meta page `_ to gain access to the model.\\nNext, make sure you grab your Hugging Face token from `here `_.\\n\\n\\n.. code-block:: bash\\n\\n tune download meta-llama/Meta-Llama-3\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:num-0\\nContent: :`download Llama3 Instruct weights `\\n\\n\\nTemplate changes from Llama2 to Llama3\\n--------------------------------------\\n\\nThe Llama2 chat model requires a specific template when prompting the pre-trained\\nmodel. Since the chat model was pretrained with this prompt template, if you want to run\\ninference on the model, you'll need to use the same template for optimal performance\\non chat data. Otherwise, the model will just perform standard text completion, which\\nmay or may not align with your intended use case.\\n\\nFrom the `official Llama2 prompt\\ntemplate guide `_\\nfor the Llama2 chat model, we can see that special tags are added:\\n\\n.. code-block:: text\\n\\n [INST] <>\\n You are a helpful, respectful, and honest assistant.\\n <>\\n\\n Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant \\n\\nLlama3 Instruct `overhauled `\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:num-0\\nContent: 'm Meta AI, your friendly AI assistant<|eot_id|>\\n\\nThe tags are entirely different, and they are actually encoded differently than in\\nLlama2. Let's walk through tokenizing an example with the Llama2 template and the\\nLlama3 template to understand how.\\n\\n.. note::\\n The Llama3 Base model uses a `different prompt template\\n `_ than Llama3 Instruct\\n because it has not yet been instruct tuned and the extra special tokens are untrained. If you\\n are running inference on the Llama3 Base model without fine-tuning we recommend the base\\n template for optimal performance. Generally, for instruct and chat data, we recommend using\\n Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\\n Llama3 Instruct.\\n\\n.. _prompt_template_vs_special_tokens:\\n\\nTokenizing prompt templates & special tokens\\n--------------------------------------------\\n\\nLet's say I have a sample of a single user-assistant turn accompanied with a system\\nprompt:\\n\\n.. code-block:: python\\n\\n sample = [\\n {\\n \\\"role\\\": \\\"system\\\",\\n \\\"\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:num-3\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\", and \\\"output_proj\\\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\\\"q_proj\\\", \\\"v_proj\\\"])\\n\\n.. note::\\n\\n Calling :func:`lora_llama_2\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Insert documents into memory\", \"parameters\": {}, \"tool_name\": \"insert_into_memory\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"required\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -34591,7 +46298,7 @@ "data": { "event": { "delta": { - "text": "L", + "text": "The", "type": "text" }, "event_type": { @@ -34611,7 +46318,7 @@ "data": { "event": { "delta": { - "text": "lama3-8B uses grouped-query", + "text": " function call should be in the following format", "type": "text" }, "event_type": { @@ -34631,7 +46338,7 @@ "data": { "event": { "delta": { - "text": " attention instead of", + "text": ": [function_name(parameters)]. However", "type": "text" }, "event_type": { @@ -34651,7 +46358,7 @@ "data": { "event": { "delta": { - "text": " the standard multi-head attention.", + "text": ", the function get_boiling_point is not recognized", "type": "text" }, "event_type": { @@ -34671,94 +46378,33 @@ "data": { "event": { "delta": { - "text": "", + "text": ". If the function", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "1eIEdjPP", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:18.982970+00:00", - "__module__": "datetime" - }, - "trace_id": "rNeuYcnxTSqrP6Dg", - "type": "metric", - "unit": "tokens", - "value": 80 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "1eIEdjPP", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:18.983000+00:00", - "__module__": "datetime" - }, - "trace_id": "rNeuYcnxTSqrP6Dg", - "type": "metric", - "unit": "tokens", - "value": 28 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "1eIEdjPP", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:18.983005+00:00", - "__module__": "datetime" - }, - "trace_id": "rNeuYcnxTSqrP6Dg", - "type": "metric", - "unit": "tokens", - "value": 108 - } - ] + "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Llama3-8B attention type\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:num-1\\nContent: 3 `_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\\nof models across a `range of different benchmarks `_.\\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. In this tutorial we will focus on the 8B size model.\\nThere are a few main changes between Llama2-7B and Llama3-8B models:\\n\\n- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B\\n- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:num-1\\nContent: instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_\\n\\n|\\n\\nGetting access to Llama3-8B-Instruct\\n------------------------------------\\n\\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. First, let's download the model from Hugging Face. You will need to follow the instructions\\non the `official Meta page `_ to gain access to the model.\\nNext, make sure you grab your Hugging Face token from `here `_.\\n\\n\\n.. code-block:: bash\\n\\n tune download meta-llama/Meta-Llama-3\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:num-0\\nContent: :`download Llama3 Instruct weights `\\n\\n\\nTemplate changes from Llama2 to Llama3\\n--------------------------------------\\n\\nThe Llama2 chat model requires a specific template when prompting the pre-trained\\nmodel. Since the chat model was pretrained with this prompt template, if you want to run\\ninference on the model, you'll need to use the same template for optimal performance\\non chat data. Otherwise, the model will just perform standard text completion, which\\nmay or may not align with your intended use case.\\n\\nFrom the `official Llama2 prompt\\ntemplate guide `_\\nfor the Llama2 chat model, we can see that special tags are added:\\n\\n.. code-block:: text\\n\\n [INST] <>\\n You are a helpful, respectful, and honest assistant.\\n <>\\n\\n Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant \\n\\nLlama3 Instruct `overhauled `\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:num-0\\nContent: 'm Meta AI, your friendly AI assistant<|eot_id|>\\n\\nThe tags are entirely different, and they are actually encoded differently than in\\nLlama2. Let's walk through tokenizing an example with the Llama2 template and the\\nLlama3 template to understand how.\\n\\n.. note::\\n The Llama3 Base model uses a `different prompt template\\n `_ than Llama3 Instruct\\n because it has not yet been instruct tuned and the extra special tokens are untrained. If you\\n are running inference on the Llama3 Base model without fine-tuning we recommend the base\\n template for optimal performance. Generally, for instruct and chat data, we recommend using\\n Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\\n Llama3 Instruct.\\n\\n.. _prompt_template_vs_special_tokens:\\n\\nTokenizing prompt templates & special tokens\\n--------------------------------------------\\n\\nLet's say I have a sample of a single user-assistant turn accompanied with a system\\nprompt:\\n\\n.. code-block:: python\\n\\n sample = [\\n {\\n \\\"role\\\": \\\"system\\\",\\n \\\"\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:num-3\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\", and \\\"output_proj\\\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\\\"q_proj\\\", \\\"v_proj\\\"])\\n\\n.. note::\\n\\n Calling :func:`lora_llama_2\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", + "text": " is supposed to return the boiling point of a liquid, it should be defined", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -34772,7 +46418,7 @@ "data": { "event": { "delta": { - "text": "L", + "text": " before it can be used. \n\nIn this", "type": "text" }, "event_type": { @@ -34792,7 +46438,7 @@ "data": { "event": { "delta": { - "text": "lama3-8B uses grouped-query attention instead of", + "text": " case, I will assume that the function get_boiling_point is defined as", "type": "text" }, "event_type": { @@ -34812,7 +46458,7 @@ "data": { "event": { "delta": { - "text": " the standard", + "text": " follows:\ndef get", "type": "text" }, "event_type": { @@ -34832,7 +46478,7 @@ "data": { "event": { "delta": { - "text": " multi-head attention.", + "text": "_boiling_point(liquid_name, celcius=True):\n # This", "type": "text" }, "event_type": { @@ -34852,94 +46498,53 @@ "data": { "event": { "delta": { - "text": "", + "text": " function returns the", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "SlTnlfYc", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:12.884663+00:00", - "__module__": "datetime" - }, - "trace_id": "liTx9auyTkyfvrBr", - "type": "metric", - "unit": "tokens", - "value": 80 + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " boiling point of a liquid in Celcius or Fahrenheit\n boiling_points", + "type": "text" }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "SlTnlfYc", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:12.884753+00:00", - "__module__": "datetime" - }, - "trace_id": "liTx9auyTkyfvrBr", - "type": "metric", - "unit": "tokens", - "value": 28 + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "SlTnlfYc", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:12.884760+00:00", - "__module__": "datetime" - }, - "trace_id": "liTx9auyTkyfvrBr", - "type": "metric", - "unit": "tokens", - "value": 108 - } - ] + "logprobs": null, + "stop_reason": null + }, + "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Insert documents into memory\", \"parameters\": {}, \"tool_name\": \"insert_into_memory\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", + "text": " = {\n \"water\": 100,\n \"polyjuice\":", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -34953,7 +46558,7 @@ "data": { "event": { "delta": { - "text": "[k", + "text": " 120 # Assuming poly", "type": "text" }, "event_type": { @@ -34973,7 +46578,7 @@ "data": { "event": { "delta": { - "text": "nowledge_search(query=\"Llama3-8", + "text": "juice has a boiling point of 120 degrees Cel", "type": "text" }, "event_type": { @@ -34993,7 +46598,7 @@ "data": { "event": { "delta": { - "text": "B attention type\")]", + "text": "cius\n }\n if liquid", "type": "text" }, "event_type": { @@ -35013,19 +46618,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "Llama3-8B attention type" - }, - "call_id": "4901bbdf-8faf-4a57-b6f6-01688c6290e6", - "tool_name": "knowledge_search" - }, - "type": "tool_call" + "text": "_name in boiling_points:\n if celcius:\n return", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -35033,11 +46627,7 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } @@ -35048,94 +46638,53 @@ "data": { "event": { "delta": { - "text": "", + "text": " boiling_points[liquid_name]\n else:\n return boiling_points[liquid", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "DBPomV08", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:15.412559+00:00", - "__module__": "datetime" - }, - "trace_id": "rNeuYcnxTSqrP6Dg", - "type": "metric", - "unit": "tokens", - "value": 40 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "DBPomV08", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:15.412607+00:00", - "__module__": "datetime" - }, - "trace_id": "rNeuYcnxTSqrP6Dg", - "type": "metric", - "unit": "tokens", - "value": 24 + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "_name] * 9/5 + ", + "type": "text" }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "DBPomV08", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:15.412615+00:00", - "__module__": "datetime" - }, - "trace_id": "rNeuYcnxTSqrP6Dg", - "type": "metric", - "unit": "tokens", - "value": 64 - } - ] + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", + "text": "32\n else:\n return \"Boiling point not found", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -35149,7 +46698,7 @@ "data": { "event": { "delta": { - "text": "[k", + "text": "\"\n\nNow, the function call", "type": "text" }, "event_type": { @@ -35169,7 +46718,7 @@ "data": { "event": { "delta": { - "text": "nowledge_search(query=\"Llama3-8B attention", + "text": " should be: \n", "type": "text" }, "event_type": { @@ -35189,7 +46738,7 @@ "data": { "event": { "delta": { - "text": " type\")]", + "text": "[get_boiling_point(liquid_name=\"polyju", "type": "text" }, "event_type": { @@ -35209,19 +46758,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "Llama3-8B attention type" - }, - "call_id": "dd056386-b105-47e5-bd85-07e5ae096de1", - "tool_name": "knowledge_search" - }, - "type": "tool_call" + "text": "ice\", celcius=True)]", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -35229,11 +46767,7 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } @@ -35266,16 +46800,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "yjKrmpeo", + "span_id": "NnkGeCwM", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:12.041566+00:00", + "__datetime__": "2025-03-07T01:44:35.213901+00:00", "__module__": "datetime" }, - "trace_id": "liTx9auyTkyfvrBr", + "trace_id": "7ifSRjCjRIioDOte", "type": "metric", "unit": "tokens", - "value": 40 + "value": 86 }, { "attributes": { @@ -35283,16 +46817,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "yjKrmpeo", + "span_id": "NnkGeCwM", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:12.041591+00:00", + "__datetime__": "2025-03-07T01:44:35.213925+00:00", "__module__": "datetime" }, - "trace_id": "liTx9auyTkyfvrBr", + "trace_id": "7ifSRjCjRIioDOte", "type": "metric", "unit": "tokens", - "value": 24 + "value": 234 }, { "attributes": { @@ -35300,16 +46834,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "yjKrmpeo", + "span_id": "NnkGeCwM", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:12.041597+00:00", + "__datetime__": "2025-03-07T01:44:35.213931+00:00", "__module__": "datetime" }, - "trace_id": "liTx9auyTkyfvrBr", + "trace_id": "7ifSRjCjRIioDOte", "type": "metric", "unit": "tokens", - "value": 64 + "value": 320 } ] } @@ -35317,7 +46851,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the current CEO of Meta is.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"current CEO of Meta\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"{\\\"query\\\": \\\"current CEO of Meta\\\", \\\"top_k\\\": [{\\\"title\\\": \\\"Meta - Leadership & Governance\\\", \\\"url\\\": \\\"https://investor.atmeta.com/leadership-and-governance/\\\", \\\"content\\\": \\\"Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. Mark is responsible for setting the overall direction and product strategy for the company. He leads the design of Meta's services and development of its core technology and infrastructure. Mark studied computer science at Harvard\\\", \\\"score\\\": 0.8342047, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Executives - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer Joel Kaplan, Chief Global Affairs Officer Susan Li, Chief Financial Officer Javier Olivan, Chief Operating Officer Chris Cox, Chief Product Officer Andrew \\\\u2018Boz\\\\u2019 Bosworth, Chief Technology Officer Jennifer Newstead, Chief Legal Officer Dave Wehner, Chief Strategy Officer Will Cathcart, Head of WhatsApp Naomi Gleit, Head of Product John Hegeman, Chief Revenue Officer Adam Mosseri, Head of Instagram Erin Egan, Chief Privacy Officer, Policy Michel Protti, Chief Privacy Officer, Product Alex Schultz, Chief Marketing Officer and VP of Analytics Tom Alison, Head of Facebook Nicola Mendelsohn, Head of Global Business Group Ahmad Al-Dahle, VP and Head of GenAI at Meta Joelle Pineau, Vice President of AI Research and Head of FAIR at Meta\\\", \\\"score\\\": 0.8190992, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\\\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\\\u00a9 2025 Meta\\\", \\\"score\\\": 0.79099923, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Meet the Executive CSuite Team of Meta (Facebook) [2025]\\\", \\\"url\\\": \\\"https://digitaldefynd.com/IQ/meet-the-executive-csuite-team-of-meta-facebook/\\\", \\\"content\\\": \\\"Harvard University Executive Programs Free Harvard University Courses As a chief financial officer of Meta, Susan Li oversees the firm\\\\u2019s finance and facilities team to keep track of the company\\\\u2019s overall financial health. The chief operating officer of Meta, Javier Olivan, oversees the firm\\\\u2019s business team, infrastructure, and other products. Andrew Bosworth, called Boz, serves as chief technology officer at Meta and is responsible for leading the firm\\\\u2019s AR/VR organization, Reality Labs. Andrew has also served as engineering director to oversee events, mobile monetization, and feed ads and as VP of ads and business platforms to lead engineering, design, analytics, and product teams. Meta\\\\u2019s c-suite team comprises experienced and diverse executives, having extensive experience in technology, finance, legal, and all major industries.\\\", \\\"score\\\": 0.7602419, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg - Wikipedia\\\", \\\"url\\\": \\\"https://en.wikipedia.org/wiki/Mark_Zuckerberg\\\", \\\"content\\\": \\\"They began dating in 2003.[175] In September 2010, Chan, who was a medical student at the University of California, San Francisco at the time,[176] moved into his rented house in Palo Alto, California.[177][178] They married on May 19, 2012, in the grounds of his mansion in an event that also celebrated her graduation from medical school.[179][180] Zuckerberg revealed in July 2015 that they were expecting a baby girl and that Chan had previously experienced three miscarriages.[181] Their first daughter was born in December 2015.[182] They announced in a Chinese New Year video that their daughter's Chinese name is Chen Mingyu (Chinese: \\\\u9648\\\\u660e\\\\u5b87).[183] Their second daughter was born in August 2017.[184] Zuckerberg and his wife welcomed their third daughter in March 2023 and announced the news across his social media pages.[185] The couple also have a Puli dog named Beast,[186] who has over two million followers on Facebook.[187] Zuckerberg commissioned the visual artist Daniel Arsham to build a 7-foot-tall sculpture of his wife, which was unveiled in 2024.[188]\\\", \\\"score\\\": 0.05564338, \\\"raw_content\\\": null}]}\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": \"get_boiling_point\", \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -35345,7 +46879,7 @@ "data": { "event": { "delta": { - "text": "The", + "text": "[", "type": "text" }, "event_type": { @@ -35365,7 +46899,7 @@ "data": { "event": { "delta": { - "text": " current CEO of Meta is Mark Zuckerberg.", + "text": "get_boiling", "type": "text" }, "event_type": { @@ -35385,94 +46919,33 @@ "data": { "event": { "delta": { - "text": "", + "text": "_point(liquid_name", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "oB7hDf6E", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:07.084924+00:00", - "__module__": "datetime" - }, - "trace_id": "hwA8OLUhQ1qa3ecF", - "type": "metric", - "unit": "tokens", - "value": 1145 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "oB7hDf6E", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:07.084934+00:00", - "__module__": "datetime" - }, - "trace_id": "hwA8OLUhQ1qa3ecF", - "type": "metric", - "unit": "tokens", - "value": 19 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "oB7hDf6E", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:07.084936+00:00", - "__module__": "datetime" - }, - "trace_id": "hwA8OLUhQ1qa3ecF", - "type": "metric", - "unit": "tokens", - "value": 1164 - } - ] + "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the current CEO of Meta is.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", + "text": "='polyjuice", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -35486,13 +46959,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" + "text": "', celcius", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -35511,13 +46979,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "brave_search.call(query=\"current CEO of Meta\")", - "type": "tool_call" + "text": "=True)]", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -35543,14 +47006,11 @@ }, "tool_call": { "arguments": { - "query": "current CEO of Meta" + "celcius": true, + "liquid_name": "polyjuice" }, - "call_id": "535c272b-768b-44fe-b303-2eae022f67f5", - "tool_name": { - "__enum__": "BuiltinTool", - "__module__": "llama_stack.models.llama.datatypes", - "value": "brave_search" - } + "call_id": "50ff6c7b-d098-4a3c-b299-117afe819175", + "tool_name": "get_boiling_point" }, "type": "tool_call" }, @@ -35597,16 +47057,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "AZ60Ocso", + "span_id": "c_LlCAG8", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:03.907918+00:00", + "__datetime__": "2025-03-12T23:17:00.095577+00:00", "__module__": "datetime" }, - "trace_id": "hwA8OLUhQ1qa3ecF", + "trace_id": "zeoAtcHFTnmC8N9f", "type": "metric", "unit": "tokens", - "value": 34 + "value": 30 }, { "attributes": { @@ -35614,16 +47074,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "AZ60Ocso", + "span_id": "c_LlCAG8", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:03.907933+00:00", + "__datetime__": "2025-03-12T23:17:00.095608+00:00", "__module__": "datetime" }, - "trace_id": "hwA8OLUhQ1qa3ecF", + "trace_id": "zeoAtcHFTnmC8N9f", "type": "metric", "unit": "tokens", - "value": 10 + "value": 28 }, { "attributes": { @@ -35631,16 +47091,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "AZ60Ocso", + "span_id": "c_LlCAG8", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:03.907936+00:00", + "__datetime__": "2025-03-12T23:17:00.095615+00:00", "__module__": "datetime" }, - "trace_id": "hwA8OLUhQ1qa3ecF", + "trace_id": "zeoAtcHFTnmC8N9f", "type": "metric", "unit": "tokens", - "value": 44 + "value": 58 } ] } @@ -35648,7 +47108,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": \"get_boiling_point\", \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -35676,7 +47136,7 @@ "data": { "event": { "delta": { - "text": "The", + "text": "[", "type": "text" }, "event_type": { @@ -35696,7 +47156,7 @@ "data": { "event": { "delta": { - "text": " boiling point of polyjuice is -100 degrees Celsius", + "text": "get_bo", "type": "text" }, "event_type": { @@ -35716,7 +47176,67 @@ "data": { "event": { "delta": { - "text": ".", + "text": "iling_point(", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "liquid_name='", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "polyjuice', celci", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "us=True)]", "type": "text" }, "event_type": { @@ -35725,7 +47245,43 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "celcius": true, + "liquid_name": "polyjuice" + }, + "call_id": "df35163f-539a-47dc-97e3-2569a6ad92fc", + "tool_name": "get_boiling_point" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -35758,16 +47314,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "drZjZkfj", + "span_id": "jzMZxiDn", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T02:04:33.852666+00:00", + "__datetime__": "2025-03-12T23:16:33.874568+00:00", "__module__": "datetime" }, - "trace_id": "Sn0I7GFHTxKxewK2", + "trace_id": "3v9VtTtdSdGcu8a7", "type": "metric", "unit": "tokens", - "value": 77 + "value": 30 }, { "attributes": { @@ -35775,16 +47331,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "drZjZkfj", + "span_id": "jzMZxiDn", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T02:04:33.852692+00:00", + "__datetime__": "2025-03-12T23:16:33.874602+00:00", "__module__": "datetime" }, - "trace_id": "Sn0I7GFHTxKxewK2", + "trace_id": "3v9VtTtdSdGcu8a7", "type": "metric", "unit": "tokens", - "value": 23 + "value": 28 }, { "attributes": { @@ -35792,16 +47348,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "drZjZkfj", + "span_id": "jzMZxiDn", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T02:04:33.852699+00:00", + "__datetime__": "2025-03-12T23:16:33.874608+00:00", "__module__": "datetime" }, - "trace_id": "Sn0I7GFHTxKxewK2", + "trace_id": "3v9VtTtdSdGcu8a7", "type": "metric", "unit": "tokens", - "value": 100 + "value": 58 } ] } @@ -35809,7 +47365,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"none\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -35837,7 +47393,7 @@ "data": { "event": { "delta": { - "text": "The", + "text": "Poly", "type": "text" }, "event_type": { @@ -35857,7 +47413,7 @@ "data": { "event": { "delta": { - "text": " boiling point of polyjuice is -100 degrees Celsius.", + "text": "juice Potion", "type": "text" }, "event_type": { @@ -35877,94 +47433,33 @@ "data": { "event": { "delta": { - "text": "", + "text": " is a", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "WMEZtUXH", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:00:32.617998+00:00", - "__module__": "datetime" - }, - "trace_id": "f9RM1qaUTk2LvaVo", - "type": "metric", - "unit": "tokens", - "value": 77 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "WMEZtUXH", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:00:32.618030+00:00", - "__module__": "datetime" - }, - "trace_id": "f9RM1qaUTk2LvaVo", - "type": "metric", - "unit": "tokens", - "value": 23 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "WMEZtUXH", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:00:32.618036+00:00", - "__module__": "datetime" - }, - "trace_id": "f9RM1qaUTk2LvaVo", - "type": "metric", - "unit": "tokens", - "value": 100 - } - ] + "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"required\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", + "text": " fictional substance from the Harry Potter", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -35978,7 +47473,7 @@ "data": { "event": { "delta": { - "text": "The", + "text": " book series by J.K", "type": "text" }, "event_type": { @@ -35998,7 +47493,7 @@ "data": { "event": { "delta": { - "text": " function get_boiling_point is not", + "text": ". Rowling. As it", "type": "text" }, "event_type": { @@ -36018,7 +47513,7 @@ "data": { "event": { "delta": { - "text": " able", + "text": "'s", "type": "text" }, "event_type": { @@ -36038,7 +47533,7 @@ "data": { "event": { "delta": { - "text": " to find the", + "text": " not a", "type": "text" }, "event_type": { @@ -36058,7 +47553,7 @@ "data": { "event": { "delta": { - "text": " boiling point of \"polyjuice\" as", + "text": " real-world substance", "type": "text" }, "event_type": { @@ -36078,7 +47573,7 @@ "data": { "event": { "delta": { - "text": " it", + "text": ", it doesn't have a", "type": "text" }, "event_type": { @@ -36098,7 +47593,7 @@ "data": { "event": { "delta": { - "text": " is not a real liquid", + "text": " boiling point or", "type": "text" }, "event_type": { @@ -36118,7 +47613,7 @@ "data": { "event": { "delta": { - "text": ". Polyju", + "text": " any", "type": "text" }, "event_type": { @@ -36138,7 +47633,7 @@ "data": { "event": { "delta": { - "text": "ice is a fictional substance from the", + "text": " other physical properties that can", "type": "text" }, "event_type": { @@ -36158,7 +47653,7 @@ "data": { "event": { "delta": { - "text": " Harry Potter series.", + "text": " be measured.\n\n", "type": "text" }, "event_type": { @@ -36178,94 +47673,53 @@ "data": { "event": { "delta": { - "text": "", + "text": "In the", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "p7Vx9VAq", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:04:28.232189+00:00", - "__module__": "datetime" - }, - "trace_id": "WKEqFugATCeCl8mc", - "type": "metric", - "unit": "tokens", - "value": 77 + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " Harry Potter universe,", + "type": "text" }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "p7Vx9VAq", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:04:28.232325+00:00", - "__module__": "datetime" - }, - "trace_id": "WKEqFugATCeCl8mc", - "type": "metric", - "unit": "tokens", - "value": 51 + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "p7Vx9VAq", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:04:28.232334+00:00", - "__module__": "datetime" - }, - "trace_id": "WKEqFugATCeCl8mc", - "type": "metric", - "unit": "tokens", - "value": 128 - } - ] + "logprobs": null, + "stop_reason": null + }, + "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": \"get_boiling_point\", \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", + "text": " Polyjuice Potion", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -36279,7 +47733,7 @@ "data": { "event": { "delta": { - "text": "The", + "text": " is a", "type": "text" }, "event_type": { @@ -36299,7 +47753,7 @@ "data": { "event": { "delta": { - "text": " function call should be", + "text": " magical brew that", "type": "text" }, "event_type": { @@ -36319,7 +47773,7 @@ "data": { "event": { "delta": { - "text": ":\n[", + "text": " allows the drinker", "type": "text" }, "event_type": { @@ -36339,7 +47793,7 @@ "data": { "event": { "delta": { - "text": "get", + "text": " to assume", "type": "text" }, "event_type": { @@ -36359,7 +47813,7 @@ "data": { "event": { "delta": { - "text": "_boiling_point(liquid_name='polyjuice', celci", + "text": " the form and", "type": "text" }, "event_type": { @@ -36379,7 +47833,7 @@ "data": { "event": { "delta": { - "text": "us=True)]", + "text": " appearance of another person", "type": "text" }, "event_type": { @@ -36399,94 +47853,53 @@ "data": { "event": { "delta": { - "text": "", + "text": ". Its properties", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "JN7UZs_c", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:42.473221+00:00", - "__module__": "datetime" - }, - "trace_id": "H3r-_Zh-TVqtSp7k", - "type": "metric", - "unit": "tokens", - "value": 86 + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " and behavior are", + "type": "text" }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "JN7UZs_c", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:42.473254+00:00", - "__module__": "datetime" - }, - "trace_id": "H3r-_Zh-TVqtSp7k", - "type": "metric", - "unit": "tokens", - "value": 34 + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "JN7UZs_c", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:42.473261+00:00", - "__module__": "datetime" - }, - "trace_id": "H3r-_Zh-TVqtSp7k", - "type": "metric", - "unit": "tokens", - "value": 120 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { - "chunks": [ + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", + "text": " governed by the rules of magic within the fictional world, rather than", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -36500,7 +47913,7 @@ "data": { "event": { "delta": { - "text": "The", + "text": " by the laws", "type": "text" }, "event_type": { @@ -36520,7 +47933,7 @@ "data": { "event": { "delta": { - "text": " function `get_boiling_point`", + "text": " of physics and chemistry", "type": "text" }, "event_type": { @@ -36540,7 +47953,7 @@ "data": { "event": { "delta": { - "text": " is not a real function and cannot be", + "text": " that apply to real", "type": "text" }, "event_type": { @@ -36560,7 +47973,7 @@ "data": { "event": { "delta": { - "text": " used to determine the boiling point of polyju", + "text": "-world substances.\n\nSo", "type": "text" }, "event_type": { @@ -36580,7 +47993,7 @@ "data": { "event": { "delta": { - "text": "ice. Polyjuice is a fictional substance from the", + "text": ", I", "type": "text" }, "event_type": { @@ -36600,7 +48013,7 @@ "data": { "event": { "delta": { - "text": " Harry Potter series and does not have a real-world boiling", + "text": "'m afraid", "type": "text" }, "event_type": { @@ -36620,7 +48033,7 @@ "data": { "event": { "delta": { - "text": " point. If you have any other questions or need help", + "text": " there's no", "type": "text" }, "event_type": { @@ -36640,7 +48053,7 @@ "data": { "event": { "delta": { - "text": " with a different topic, feel free to ask!", + "text": " boiling point to report", "type": "text" }, "event_type": { @@ -36660,94 +48073,33 @@ "data": { "event": { "delta": { - "text": "", + "text": " for Polyjuice", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "aCPTIc0d", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:53:27.227208+00:00", - "__module__": "datetime" - }, - "trace_id": "4DRyVE86RpCeqfpE", - "type": "metric", - "unit": "tokens", - "value": 86 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "aCPTIc0d", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:53:27.227251+00:00", - "__module__": "datetime" - }, - "trace_id": "4DRyVE86RpCeqfpE", - "type": "metric", - "unit": "tokens", - "value": 78 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "aCPTIc0d", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:53:27.227258+00:00", - "__module__": "datetime" - }, - "trace_id": "4DRyVE86RpCeqfpE", - "type": "metric", - "unit": "tokens", - "value": 164 - } - ] + "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"required\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", + "text": " Potion!", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -36761,7 +48113,7 @@ "data": { "event": { "delta": { - "text": "The", + "text": " Would you like to", "type": "text" }, "event_type": { @@ -36781,7 +48133,7 @@ "data": { "event": { "delta": { - "text": " function call should be in the following format", + "text": " know more about the", "type": "text" }, "event_type": { @@ -36801,7 +48153,7 @@ "data": { "event": { "delta": { - "text": ": [function_name(parameters)]. However", + "text": " Harry Potter series or", "type": "text" }, "event_type": { @@ -36821,7 +48173,7 @@ "data": { "event": { "delta": { - "text": ", the function get_boiling_point is not recognized", + "text": " is there something else", "type": "text" }, "event_type": { @@ -36841,7 +48193,7 @@ "data": { "event": { "delta": { - "text": ". If the function", + "text": " I can help you", "type": "text" }, "event_type": { @@ -36861,7 +48213,7 @@ "data": { "event": { "delta": { - "text": " is supposed to return the boiling point of a liquid, it should be defined", + "text": " with?", "type": "text" }, "event_type": { @@ -36881,33 +48233,94 @@ "data": { "event": { "delta": { - "text": " before it can be used. \n\nIn this", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "MD9yQkRd", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-12T23:16:55.850173+00:00", + "__module__": "datetime" + }, + "trace_id": "fh6SDMFUQtK_wjC3", + "type": "metric", + "unit": "tokens", + "value": 30 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "MD9yQkRd", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-12T23:16:55.850206+00:00", + "__module__": "datetime" + }, + "trace_id": "fh6SDMFUQtK_wjC3", + "type": "metric", + "unit": "tokens", + "value": 157 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "MD9yQkRd", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-12T23:16:55.850213+00:00", + "__module__": "datetime" + }, + "trace_id": "fh6SDMFUQtK_wjC3", + "type": "metric", + "unit": "tokens", + "value": 187 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"required\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " case, I will assume that the function get_boiling_point is defined as", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -36921,7 +48334,7 @@ "data": { "event": { "delta": { - "text": " follows:\ndef get", + "text": "[", "type": "text" }, "event_type": { @@ -36941,7 +48354,7 @@ "data": { "event": { "delta": { - "text": "_boiling_point(liquid_name, celcius=True):\n # This", + "text": "get_boiling_point(liquid", "type": "text" }, "event_type": { @@ -36961,7 +48374,7 @@ "data": { "event": { "delta": { - "text": " function returns the", + "text": "_name='polyjuice',", "type": "text" }, "event_type": { @@ -36981,7 +48394,7 @@ "data": { "event": { "delta": { - "text": " boiling point of a liquid in Celcius or Fahrenheit\n boiling_points", + "text": " celcius=True)]", "type": "text" }, "event_type": { @@ -37001,8 +48414,20 @@ "data": { "event": { "delta": { - "text": " = {\n \"water\": 100,\n \"polyjuice\":", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "celcius": true, + "liquid_name": "polyjuice" + }, + "call_id": "42a89a20-0a36-41cc-83a0-2725428f91b7", + "tool_name": "get_boiling_point" + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -37010,7 +48435,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -37021,13 +48450,94 @@ "data": { "event": { "delta": { - "text": " 120 # Assuming poly", + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "YObSruYs", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-12T23:16:46.763088+00:00", + "__module__": "datetime" + }, + "trace_id": "ovdyCmhfRdG3MKrj", + "type": "metric", + "unit": "tokens", + "value": 30 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "YObSruYs", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-12T23:16:46.763121+00:00", + "__module__": "datetime" + }, + "trace_id": "ovdyCmhfRdG3MKrj", + "type": "metric", + "unit": "tokens", + "value": 28 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "YObSruYs", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-12T23:16:46.763131+00:00", + "__module__": "datetime" + }, + "trace_id": "ovdyCmhfRdG3MKrj", + "type": "metric", + "unit": "tokens", + "value": 58 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Write code and execute it to find the answer for: What is the 100th prime number?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"def is_prime(n):\\n if n <= 1:\\n return False\\n if n <= 3:\\n return True\\n if n % 2 == 0 or n % 3 == 0:\\n return False\\n i = 5\\n while i * i <= n:\\n if n % i == 0 or n % (i + 2) == 0:\\n return False\\n i += 6\\n return True\\n\\ndef nth_prime(n):\\n count = 0\\n num = 2\\n while True:\\n if is_prime(num):\\n count += 1\\n if count == n:\\n return num\\n num += 1\\n\\nprint(nth_prime(100))\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -37041,7 +48551,7 @@ "data": { "event": { "delta": { - "text": "juice has a boiling point of 120 degrees Cel", + "text": "The", "type": "text" }, "event_type": { @@ -37061,7 +48571,7 @@ "data": { "event": { "delta": { - "text": "cius\n }\n if liquid", + "text": " code defines two", "type": "text" }, "event_type": { @@ -37081,7 +48591,7 @@ "data": { "event": { "delta": { - "text": "_name in boiling_points:\n if celcius:\n return", + "text": " functions: `is_prime(n", "type": "text" }, "event_type": { @@ -37101,7 +48611,7 @@ "data": { "event": { "delta": { - "text": " boiling_points[liquid_name]\n else:\n return boiling_points[liquid", + "text": ")` checks if", "type": "text" }, "event_type": { @@ -37121,7 +48631,7 @@ "data": { "event": { "delta": { - "text": "_name] * 9/5 + ", + "text": " a number `n` is", "type": "text" }, "event_type": { @@ -37141,7 +48651,7 @@ "data": { "event": { "delta": { - "text": "32\n else:\n return \"Boiling point not found", + "text": " prime, and `nth_prime", "type": "text" }, "event_type": { @@ -37161,7 +48671,7 @@ "data": { "event": { "delta": { - "text": "\"\n\nNow, the function call", + "text": "(n)` finds the `n", "type": "text" }, "event_type": { @@ -37181,7 +48691,7 @@ "data": { "event": { "delta": { - "text": " should be: \n", + "text": "`th prime number. The", "type": "text" }, "event_type": { @@ -37201,7 +48711,7 @@ "data": { "event": { "delta": { - "text": "[get_boiling_point(liquid_name=\"polyju", + "text": " `is_prime(n)` function checks", "type": "text" }, "event_type": { @@ -37221,7 +48731,7 @@ "data": { "event": { "delta": { - "text": "ice\", celcius=True)]", + "text": " if", "type": "text" }, "event_type": { @@ -37241,94 +48751,13 @@ "data": { "event": { "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "NnkGeCwM", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:35.213901+00:00", - "__module__": "datetime" - }, - "trace_id": "7ifSRjCjRIioDOte", - "type": "metric", - "unit": "tokens", - "value": 86 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "NnkGeCwM", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:35.213925+00:00", - "__module__": "datetime" - }, - "trace_id": "7ifSRjCjRIioDOte", - "type": "metric", - "unit": "tokens", - "value": 234 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "NnkGeCwM", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:35.213931+00:00", - "__module__": "datetime" - }, - "trace_id": "7ifSRjCjRIioDOte", - "type": "metric", - "unit": "tokens", - "value": 320 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": \"get_boiling_point\", \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", + "text": " `n", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -37342,7 +48771,7 @@ "data": { "event": { "delta": { - "text": "[", + "text": "` is less than or equal", "type": "text" }, "event_type": { @@ -37362,7 +48791,7 @@ "data": { "event": { "delta": { - "text": "get_boiling_point(liquid_name='polyjuice", + "text": " to 1 (not", "type": "text" }, "event_type": { @@ -37382,7 +48811,7 @@ "data": { "event": { "delta": { - "text": "', celcius=True)]", + "text": " prime), less than or equal", "type": "text" }, "event_type": { @@ -37402,20 +48831,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "celcius": true, - "liquid_name": "polyjuice" - }, - "call_id": "d43b2636-903d-430d-8389-91eefe5a1d75", - "tool_name": "get_boiling_point" - }, - "type": "tool_call" + "text": " to 3", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -37423,11 +48840,7 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } @@ -37438,94 +48851,13 @@ "data": { "event": { "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "9EBiVeAT", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:04:32.221646+00:00", - "__module__": "datetime" - }, - "trace_id": "7kB12OwpSUOcwmJV", - "type": "metric", - "unit": "tokens", - "value": 30 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "9EBiVeAT", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:04:32.221673+00:00", - "__module__": "datetime" - }, - "trace_id": "7kB12OwpSUOcwmJV", - "type": "metric", - "unit": "tokens", - "value": 28 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "9EBiVeAT", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:04:32.221680+00:00", - "__module__": "datetime" - }, - "trace_id": "7kB12OwpSUOcwmJV", - "type": "metric", - "unit": "tokens", - "value": 58 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", + "text": " (prime), or if it", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -37539,7 +48871,7 @@ "data": { "event": { "delta": { - "text": "[", + "text": "'s divisible by 2 or", "type": "text" }, "event_type": { @@ -37559,7 +48891,7 @@ "data": { "event": { "delta": { - "text": "get_boiling_point(liquid_name", + "text": " 3 (not prime).", "type": "text" }, "event_type": { @@ -37579,7 +48911,7 @@ "data": { "event": { "delta": { - "text": "='polyjuice', celcius=True)]", + "text": " If none of", "type": "text" }, "event_type": { @@ -37599,20 +48931,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "celcius": true, - "liquid_name": "polyjuice" - }, - "call_id": "0548b2ef-daa4-4099-bb2c-b34f00752339", - "tool_name": "get_boiling_point" - }, - "type": "tool_call" + "text": " these conditions are met, it", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -37620,11 +48940,7 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } @@ -37635,94 +48951,53 @@ "data": { "event": { "delta": { - "text": "", + "text": " checks divisibility", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "lc3YWIQH", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:00:31.366139+00:00", - "__module__": "datetime" - }, - "trace_id": "zDQV0rn3TNKfByA0", - "type": "metric", - "unit": "tokens", - "value": 30 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "lc3YWIQH", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:00:31.366166+00:00", - "__module__": "datetime" - }, - "trace_id": "zDQV0rn3TNKfByA0", - "type": "metric", - "unit": "tokens", - "value": 28 + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " by numbers", + "type": "text" }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "lc3YWIQH", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:00:31.366172+00:00", - "__module__": "datetime" - }, - "trace_id": "zDQV0rn3TNKfByA0", - "type": "metric", - "unit": "tokens", - "value": 58 - } - ] + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"none\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", + "text": " of the", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -37736,7 +49011,7 @@ "data": { "event": { "delta": { - "text": "Poly", + "text": " form 6k \u00b1 ", "type": "text" }, "event_type": { @@ -37756,7 +49031,7 @@ "data": { "event": { "delta": { - "text": "juice is a fictional potion from", + "text": "1, where", "type": "text" }, "event_type": { @@ -37776,7 +49051,7 @@ "data": { "event": { "delta": { - "text": " the Harry Potter series by J.K. Rowling. As it", + "text": " k is an integer. The", "type": "text" }, "event_type": { @@ -37796,7 +49071,7 @@ "data": { "event": { "delta": { - "text": "'s not a real substance, it doesn't have a boiling point", + "text": " `nth_prime(n", "type": "text" }, "event_type": { @@ -37816,7 +49091,7 @@ "data": { "event": { "delta": { - "text": ". Polyjuice Potion is a magical concoction", + "text": ")` function iterates", "type": "text" }, "event_type": { @@ -37836,7 +49111,7 @@ "data": { "event": { "delta": { - "text": " that allows the drinker to assume the form and", + "text": " through numbers", "type": "text" }, "event_type": { @@ -37856,7 +49131,7 @@ "data": { "event": { "delta": { - "text": " appearance", + "text": " starting from 2, checks", "type": "text" }, "event_type": { @@ -37876,7 +49151,7 @@ "data": { "event": { "delta": { - "text": " of another person, but it's not a physical substance that can", + "text": " if each number is prime using", "type": "text" }, "event_type": { @@ -37896,7 +49171,7 @@ "data": { "event": { "delta": { - "text": " be measured or analyzed in the same way as real-world", + "text": " the `is_prime(n", "type": "text" }, "event_type": { @@ -37916,7 +49191,7 @@ "data": { "event": { "delta": { - "text": " chemicals.\n\nIf you", + "text": ")` function, and increments a", "type": "text" }, "event_type": { @@ -37936,7 +49211,7 @@ "data": { "event": { "delta": { - "text": " have any other questions or", + "text": " counter until it", "type": "text" }, "event_type": { @@ -37956,7 +49231,7 @@ "data": { "event": { "delta": { - "text": " if there's anything else I can help you with, feel free to ask", + "text": " finds the `n`th", "type": "text" }, "event_type": { @@ -37976,7 +49251,7 @@ "data": { "event": { "delta": { - "text": "!", + "text": " prime number.", "type": "text" }, "event_type": { @@ -38018,16 +49293,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "M0oC9v8Y", + "span_id": "DP8eFcnZ", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T02:04:30.531648+00:00", + "__datetime__": "2025-03-12T23:16:11.055365+00:00", "__module__": "datetime" }, - "trace_id": "0CMlh2kQShSVm3zE", + "trace_id": "eUBw_VOpS32wNAGH", "type": "metric", "unit": "tokens", - "value": 30 + "value": 252 }, { "attributes": { @@ -38035,16 +49310,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "M0oC9v8Y", + "span_id": "DP8eFcnZ", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T02:04:30.531666+00:00", + "__datetime__": "2025-03-12T23:16:11.055474+00:00", "__module__": "datetime" }, - "trace_id": "0CMlh2kQShSVm3zE", + "trace_id": "eUBw_VOpS32wNAGH", "type": "metric", "unit": "tokens", - "value": 113 + "value": 171 }, { "attributes": { @@ -38052,16 +49327,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "M0oC9v8Y", + "span_id": "DP8eFcnZ", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T02:04:30.531671+00:00", + "__datetime__": "2025-03-12T23:16:11.055486+00:00", "__module__": "datetime" }, - "trace_id": "0CMlh2kQShSVm3zE", + "trace_id": "eUBw_VOpS32wNAGH", "type": "metric", "unit": "tokens", - "value": 143 + "value": 423 } ] } @@ -38069,7 +49344,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"required\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Write code and execute it to find the answer for: What is the 100th prime number?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -38097,8 +49372,13 @@ "data": { "event": { "delta": { - "text": "[", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -38117,8 +49397,13 @@ "data": { "event": { "delta": { - "text": "get_boiling_point(liquid_name='polyjuice', cel", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "def is_prime(n):\n ", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -38137,8 +49422,13 @@ "data": { "event": { "delta": { - "text": "cius=True)]", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " if n <= 1:\n return", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -38160,16 +49450,9 @@ "parse_status": { "__enum__": "ToolCallParseStatus", "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "celcius": true, - "liquid_name": "polyjuice" - }, - "call_id": "acbb04a1-08f4-4277-9b66-aadda2fa2be7", - "tool_name": "get_boiling_point" + "value": "in_progress" }, + "tool_call": " False\n if", "type": "tool_call" }, "event_type": { @@ -38178,11 +49461,7 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } @@ -38193,94 +49472,143 @@ "data": { "event": { "delta": { - "text": "", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " n <= 3:\n ", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "jMXDDKvp", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:04:26.175063+00:00", - "__module__": "datetime" + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" }, - "trace_id": "44TwzIrGS2aqfbVn", - "type": "metric", - "unit": "tokens", - "value": 30 + "tool_call": " return True\n if n", + "type": "tool_call" }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "jMXDDKvp", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:04:26.175128+00:00", - "__module__": "datetime" + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" }, - "trace_id": "44TwzIrGS2aqfbVn", - "type": "metric", - "unit": "tokens", - "value": 28 + "tool_call": " % 2 == 0", + "type": "tool_call" }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" }, - "metric": "total_tokens", - "span_id": "jMXDDKvp", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:04:26.175137+00:00", - "__module__": "datetime" + "tool_call": " or n % 3 ==", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" }, - "trace_id": "44TwzIrGS2aqfbVn", - "type": "metric", - "unit": "tokens", - "value": 58 - } - ] + "tool_call": " 0:\n return False", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Write code and execute it to find the answer for: What is the 100th prime number?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"def is_prime(n):\\n if n <= 1:\\n return False\\n if n <= 3:\\n return True\\n if n % 2 == 0 or n % 3 == 0:\\n return False\\n i = 5\\n while i * i <= n:\\n if n % i == 0 or n % (i + 2) == 0:\\n return False\\n i += 6\\n return True\\n\\ndef nth_prime(n):\\n count = 0\\n num = 2\\n while True:\\n if is_prime(num):\\n count += 1\\n if count == n:\\n return num\\n num += 1\\n\\nprint(nth_prime(100))\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\n i", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -38294,8 +49622,13 @@ "data": { "event": { "delta": { - "text": "The", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " = 5", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -38314,8 +49647,13 @@ "data": { "event": { "delta": { - "text": " 100th prime number is 541", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\n while i * i", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -38334,8 +49672,13 @@ "data": { "event": { "delta": { - "text": ".", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " <= n:\n if n", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -38354,94 +49697,43 @@ "data": { "event": { "delta": { - "text": "", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " % i == 0 or", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "bxIams_G", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:13.404182+00:00", - "__module__": "datetime" - }, - "trace_id": "snO106yxStaL10ow", - "type": "metric", - "unit": "tokens", - "value": 252 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "bxIams_G", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:13.404224+00:00", - "__module__": "datetime" - }, - "trace_id": "snO106yxStaL10ow", - "type": "metric", - "unit": "tokens", - "value": 20 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "bxIams_G", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:13.404230+00:00", - "__module__": "datetime" - }, - "trace_id": "snO106yxStaL10ow", - "type": "metric", - "unit": "tokens", - "value": 272 - } - ] + "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Write code and execute it to find the answer for: What is the 100th prime number?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " n % (i + 2", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -38458,9 +49750,9 @@ "parse_status": { "__enum__": "ToolCallParseStatus", "__module__": "llama_stack.apis.common.content_types", - "value": "started" + "value": "in_progress" }, - "tool_call": "", + "tool_call": ") == 0:\n return False", "type": "tool_call" }, "event_type": { @@ -38485,7 +49777,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "def is_prime(n):\n if n <= 1:\n return False", + "tool_call": "\n i +=", "type": "tool_call" }, "event_type": { @@ -38510,7 +49802,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "\n if n <= 3:\n return True", + "tool_call": " 6\n return True", "type": "tool_call" }, "event_type": { @@ -38535,7 +49827,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "\n if n % 2 == 0 or n % 3", + "tool_call": "\n\ndef nth_prime(n):\n", "type": "tool_call" }, "event_type": { @@ -38560,7 +49852,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " == 0:\n return False\n i = 5\n ", + "tool_call": " count = 0\n", "type": "tool_call" }, "event_type": { @@ -38585,7 +49877,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " while i * i <= n:\n if n", + "tool_call": " num = 2\n", "type": "tool_call" }, "event_type": { @@ -38610,7 +49902,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " % i == 0 or n % (i", + "tool_call": " while True:\n if", "type": "tool_call" }, "event_type": { @@ -38635,7 +49927,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " + 2) == 0:\n return False\n i +=", + "tool_call": " is_prime(num):\n", "type": "tool_call" }, "event_type": { @@ -38660,7 +49952,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " 6\n return True\n\ndef nth_prime(n):\n count =", + "tool_call": " count += 1\n ", "type": "tool_call" }, "event_type": { @@ -38685,7 +49977,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " 0\n num = 2\n while True:\n if", + "tool_call": " if count == n:\n ", "type": "tool_call" }, "event_type": { @@ -38710,7 +50002,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " is_prime(num):\n count += 1\n if count == n", + "tool_call": " return num\n num +=", "type": "tool_call" }, "event_type": { @@ -38735,7 +50027,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": ":\n return num\n num += 1\n\nprint(nth_prime", + "tool_call": " 1\n\nprint(nth", "type": "tool_call" }, "event_type": { @@ -38760,7 +50052,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "(100))", + "tool_call": "_prime(100))", "type": "tool_call" }, "event_type": { @@ -38789,7 +50081,7 @@ "arguments": { "code": "def is_prime(n):\n if n <= 1:\n return False\n if n <= 3:\n return True\n if n % 2 == 0 or n % 3 == 0:\n return False\n i = 5\n while i * i <= n:\n if n % i == 0 or n % (i + 2) == 0:\n return False\n i += 6\n return True\n\ndef nth_prime(n):\n count = 0\n num = 2\n while True:\n if is_prime(num):\n count += 1\n if count == n:\n return num\n num += 1\n\nprint(nth_prime(100))" }, - "call_id": "e1110bc1-dc83-480d-ad33-09d49f5ccc8d", + "call_id": "9859b184-8882-4553-8e81-97c304a4fa9b", "tool_name": { "__enum__": "BuiltinTool", "__module__": "llama_stack.models.llama.datatypes", @@ -38841,13 +50133,13 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "5J3hM-La", + "span_id": "coI936YN", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:09.121100+00:00", + "__datetime__": "2025-03-12T23:16:07.172040+00:00", "__module__": "datetime" }, - "trace_id": "snO106yxStaL10ow", + "trace_id": "eUBw_VOpS32wNAGH", "type": "metric", "unit": "tokens", "value": 40 @@ -38858,13 +50150,13 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "5J3hM-La", + "span_id": "coI936YN", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:09.121127+00:00", + "__datetime__": "2025-03-12T23:16:07.172411+00:00", "__module__": "datetime" }, - "trace_id": "snO106yxStaL10ow", + "trace_id": "eUBw_VOpS32wNAGH", "type": "metric", "unit": "tokens", "value": 10 @@ -38875,13 +50167,13 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "5J3hM-La", + "span_id": "coI936YN", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:09.121132+00:00", + "__datetime__": "2025-03-12T23:16:07.172422+00:00", "__module__": "datetime" }, - "trace_id": "snO106yxStaL10ow", + "trace_id": "eUBw_VOpS32wNAGH", "type": "metric", "unit": "tokens", "value": 50 @@ -38940,7 +50232,47 @@ "data": { "event": { "delta": { - "text": "plexity the company was founded in 2022.", + "text": "plexity the company", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " was founded in 2022", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ".", "type": "text" }, "event_type": { @@ -38982,13 +50314,13 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "6jxCq3gU", + "span_id": "85ppLY3v", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:50.430436+00:00", + "__datetime__": "2025-03-12T23:18:01.989283+00:00", "__module__": "datetime" }, - "trace_id": "XhZWljYTTDCYF7vI", + "trace_id": "he4nc6x1QZ6pWLtN", "type": "metric", "unit": "tokens", "value": 68 @@ -38999,13 +50331,13 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "6jxCq3gU", + "span_id": "85ppLY3v", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:50.430477+00:00", + "__datetime__": "2025-03-12T23:18:01.989312+00:00", "__module__": "datetime" }, - "trace_id": "XhZWljYTTDCYF7vI", + "trace_id": "he4nc6x1QZ6pWLtN", "type": "metric", "unit": "tokens", "value": 22 @@ -39016,13 +50348,13 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "6jxCq3gU", + "span_id": "85ppLY3v", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:50.430489+00:00", + "__datetime__": "2025-03-12T23:18:01.989316+00:00", "__module__": "datetime" }, - "trace_id": "XhZWljYTTDCYF7vI", + "trace_id": "he4nc6x1QZ6pWLtN", "type": "metric", "unit": "tokens", "value": 90 @@ -39081,7 +50413,27 @@ "data": { "event": { "delta": { - "text": "nowledge_search(query=\"Perplexity the company", + "text": "nowledge_search(query=\"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "Perplexity the company", "type": "text" }, "event_type": { @@ -39130,7 +50482,7 @@ "arguments": { "query": "Perplexity the company founding date" }, - "call_id": "199ef050-bc11-4e4b-935d-f5241c3f40ef", + "call_id": "5845398b-1978-4138-9e72-800b65cf8fe7", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -39178,13 +50530,13 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "m4wMGuSN", + "span_id": "V20HYcJc", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:49.880525+00:00", + "__datetime__": "2025-03-12T23:18:01.118353+00:00", "__module__": "datetime" }, - "trace_id": "XhZWljYTTDCYF7vI", + "trace_id": "he4nc6x1QZ6pWLtN", "type": "metric", "unit": "tokens", "value": 29 @@ -39195,13 +50547,13 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "m4wMGuSN", + "span_id": "V20HYcJc", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:49.880576+00:00", + "__datetime__": "2025-03-12T23:18:01.118394+00:00", "__module__": "datetime" }, - "trace_id": "XhZWljYTTDCYF7vI", + "trace_id": "he4nc6x1QZ6pWLtN", "type": "metric", "unit": "tokens", "value": 23 @@ -39212,13 +50564,13 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "m4wMGuSN", + "span_id": "V20HYcJc", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:49.880585+00:00", + "__datetime__": "2025-03-12T23:18:01.118408+00:00", "__module__": "datetime" }, - "trace_id": "XhZWljYTTDCYF7vI", + "trace_id": "he4nc6x1QZ6pWLtN", "type": "metric", "unit": "tokens", "value": 52 @@ -39277,7 +50629,7 @@ "data": { "event": { "delta": { - "text": " NBA was created on August 3, 1949, with", + "text": " NBA was created on August ", "type": "text" }, "event_type": { @@ -39297,7 +50649,7 @@ "data": { "event": { "delta": { - "text": " the merger of the Basketball Association of America (BAA) and the National", + "text": "3, 1949,", "type": "text" }, "event_type": { @@ -39317,7 +50669,87 @@ "data": { "event": { "delta": { - "text": " Basketball League (NBL).", + "text": " with the merger of", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the Basketball Association of America (", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "BAA) and the National", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " Basketball League (N", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "BL).", "type": "text" }, "event_type": { @@ -39359,13 +50791,13 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "OyfVMRgR", + "span_id": "2mRpWtE_", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:53.322420+00:00", + "__datetime__": "2025-03-12T23:18:08.397090+00:00", "__module__": "datetime" }, - "trace_id": "TMrhR55CR-KrmGp0", + "trace_id": "Xa0aO1SdQOiEqarh", "type": "metric", "unit": "tokens", "value": 63 @@ -39376,13 +50808,13 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "OyfVMRgR", + "span_id": "2mRpWtE_", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:53.322482+00:00", + "__datetime__": "2025-03-12T23:18:08.397117+00:00", "__module__": "datetime" }, - "trace_id": "TMrhR55CR-KrmGp0", + "trace_id": "Xa0aO1SdQOiEqarh", "type": "metric", "unit": "tokens", "value": 45 @@ -39393,13 +50825,13 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "OyfVMRgR", + "span_id": "2mRpWtE_", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:53.322490+00:00", + "__datetime__": "2025-03-12T23:18:08.397124+00:00", "__module__": "datetime" }, - "trace_id": "TMrhR55CR-KrmGp0", + "trace_id": "Xa0aO1SdQOiEqarh", "type": "metric", "unit": "tokens", "value": 108 @@ -39458,7 +50890,27 @@ "data": { "event": { "delta": { - "text": "nowledge_search(query=\"NBA creation date\")]", + "text": "nowledge_search(query=\"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "NBA creation date\")]", "type": "text" }, "event_type": { @@ -39487,7 +50939,7 @@ "arguments": { "query": "NBA creation date" }, - "call_id": "388e55ab-448a-4a98-905b-196c051bdeea", + "call_id": "5855dd61-6243-4922-a110-f072de222c69", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -39535,13 +50987,13 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "QpFMmy3B", + "span_id": "Rn6uhM71", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:52.235138+00:00", + "__datetime__": "2025-03-12T23:18:03.628231+00:00", "__module__": "datetime" }, - "trace_id": "TMrhR55CR-KrmGp0", + "trace_id": "Xa0aO1SdQOiEqarh", "type": "metric", "unit": "tokens", "value": 27 @@ -39552,13 +51004,13 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "QpFMmy3B", + "span_id": "Rn6uhM71", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:52.235160+00:00", + "__datetime__": "2025-03-12T23:18:03.628288+00:00", "__module__": "datetime" }, - "trace_id": "TMrhR55CR-KrmGp0", + "trace_id": "Xa0aO1SdQOiEqarh", "type": "metric", "unit": "tokens", "value": 20 @@ -39569,13 +51021,13 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "QpFMmy3B", + "span_id": "Rn6uhM71", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:52.235165+00:00", + "__datetime__": "2025-03-12T23:18:03.628296+00:00", "__module__": "datetime" }, - "trace_id": "TMrhR55CR-KrmGp0", + "trace_id": "Xa0aO1SdQOiEqarh", "type": "metric", "unit": "tokens", "value": 47 diff --git a/tests/integration/fixtures/recorded_responses/invoke_tool.json b/tests/integration/fixtures/recorded_responses/invoke_tool.json index 76191e992f..ab9793303f 100644 --- a/tests/integration/fixtures/recorded_responses/invoke_tool.json +++ b/tests/integration/fixtures/recorded_responses/invoke_tool.json @@ -64,6 +64,19 @@ } } }, + "[[], {\"kwargs\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics of the dataframe\\nprint(df.describe())\", \"session_id\": \"\"}, \"tool_name\": \"code_interpreter\"}]": { + "type": "value", + "value": { + "__module__": "llama_stack.apis.tools.tools", + "__pydantic__": "ToolInvocationResult", + "data": { + "content": "error\n[stdout]\n[Errno 2] No such file or directory: 'bwrap'\n[/stdout]\n[stderr]\n[Errno 2] No such file or directory: 'bwrap'\n[/stderr]", + "error_code": null, + "error_message": null, + "metadata": null + } + } + }, "[[], {\"kwargs\": {\"code\": \"import pandas as pd\\ndf = pd.read_csv(\\\"\")\\nprint(df.head())\", \"session_id\": \"\"}, \"tool_name\": \"code_interpreter\"}]": { "type": "value", "value": { @@ -374,23 +387,23 @@ "type": "text" }, { - "text": "Result 1:\nDocument_id:24443\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\nlook like so:\n\n.. code-block:: python\n\n from torchtune.datasets import chat_dataset\n from torchtune.models.llama3 import llama3_tokenizer\n\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\n ds = chat_dataset(\n tokenizer=tokenizer,\n source=\"json\",\n data_files=\"data/my_data.json\",\n split=\"train\",\n conversation_column=\"dialogue\",\n conversation_style=\"sharegpt\",\n )\n\n.. code-block:: yaml\n\n # In config\n tokenizer:\n _component_: torchtune.models.llama3.llama3_tokenizer\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\n\n dataset:\n _component_: torchtune.datasets.chat_dataset\n source: json\n data_files: data/my_data.json\n split: train\n conversation_column: dialogue\n conversation_style: sharegpt\n\n.. note::\n You can pass in any keyword argument for `load_dataset `_ into all our\n Dataset classes and they will honor them. This is useful for common parameters\n such as specifying the data split with :code:`split` or configuration with\n :code:`name`\n\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\nall messages according to their `recommendations `_ into all our\n Dataset classes and they will honor them. This is useful for common parameters\n such as specifying the data split with :code:`split` or configuration with\n :code:`name`\n\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW `.\n.. .. _glossary_fsdp2:\n\n", + "text": "Result 3:\nDocument_id:c553d\nContent: ` module, which we swap\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP `.\n.. .. _glossary_fsdp2:\n\n", "type": "text" }, { - "text": "Result 4:\nDocument_id:961ff\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", + "text": "Result 4:\nDocument_id:4bcdb\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", "type": "text" }, { - "text": "Result 5:\nDocument_id:b49f7\nContent: etune\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.use_dora=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n use_dora: True\n\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\neven more memory savings!\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.apply_lora_to_mlp=True \\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\n model.lora_rank=16 \\\n model.lora_alpha=32 \\\n model.use_dora=True \\\n model.quantize_base=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n apply_lora_to_mlp: True\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\n lora_rank: 16\n lora_alpha: 32\n use_dora: True\n quantize_base: True\n\n\n.. note::\n\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP `.\n.. .. _glossary_fsdp2:\n\n", + "text": "Result 5:\nDocument_id:c553d\nContent: etune\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.use_dora=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n use_dora: True\n\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\neven more memory savings!\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.apply_lora_to_mlp=True \\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\n model.lora_rank=16 \\\n model.lora_alpha=32 \\\n model.use_dora=True \\\n model.quantize_base=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n apply_lora_to_mlp: True\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\n lora_rank: 16\n lora_alpha: 32\n use_dora: True\n quantize_base: True\n\n\n.. note::\n\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP `.\n.. .. _glossary_fsdp2:\n\n", "type": "text" }, { @@ -402,11 +415,11 @@ "error_message": null, "metadata": { "document_ids": [ - "24443dfb-a0b3-4ce8-820e-3fb1f12364bb", - "961ff2d1-8887-41ef-a4fe-fa4cbab7b932", - "b49f7985-6615-4dcf-99be-d1765b6a6fc6", - "961ff2d1-8887-41ef-a4fe-fa4cbab7b932", - "b49f7985-6615-4dcf-99be-d1765b6a6fc6" + "7f5245b0-58f9-44bb-8047-1c5e6c943496", + "4bcdb054-da63-48cb-b9c6-c38397569929", + "c553db3b-8eee-4027-b5b5-696fa7b6505e", + "4bcdb054-da63-48cb-b9c6-c38397569929", + "c553db3b-8eee-4027-b5b5-696fa7b6505e" ] } } @@ -418,7 +431,7 @@ "__module__": "llama_stack.apis.tools.tools", "__pydantic__": "ToolInvocationResult", "data": { - "content": "{\"query\": \"current CEO of Meta\", \"top_k\": [{\"title\": \"Meta - Leadership & Governance\", \"url\": \"https://investor.atmeta.com/leadership-and-governance/\", \"content\": \"Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. Mark is responsible for setting the overall direction and product strategy for the company. He leads the design of Meta's services and development of its core technology and infrastructure. Mark studied computer science at Harvard\", \"score\": 0.8342047, \"raw_content\": null}, {\"title\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\", \"url\": \"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\", \"content\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\u00a9 2025 Meta\", \"score\": 0.79099923, \"raw_content\": null}, {\"title\": \"The 11 People Running Meta's $1 Trillion Social Media and ... - Observer\", \"url\": \"https://observer.com/2024/01/meta-facebook-top-executives/\", \"content\": \"Meta has one of the most stable leadership team in the tech industry. Almost all of Meta's top executives have been with the company for well over a decade. ... 39, cofounder, chairman and CEO\", \"score\": 0.45536873, \"raw_content\": null}, {\"title\": \"Executives - Meta\", \"url\": \"https://about.meta.com/media-gallery/executives/\", \"content\": \"Meta leadership: images of senior executives for download to use in articles about the company.\", \"score\": 0.21026355, \"raw_content\": null}, {\"title\": \"Mark Zuckerberg - Wikipedia\", \"url\": \"https://en.wikipedia.org/wiki/Mark_Zuckerberg\", \"content\": \"They began dating in 2003.[175] In September 2010, Chan, who was a medical student at the University of California, San Francisco at the time,[176] moved into his rented house in Palo Alto, California.[177][178] They married on May 19, 2012, in the grounds of his mansion in an event that also celebrated her graduation from medical school.[179][180] Zuckerberg revealed in July 2015 that they were expecting a baby girl and that Chan had previously experienced three miscarriages.[181] Their first daughter was born in December 2015.[182] They announced in a Chinese New Year video that their daughter's Chinese name is Chen Mingyu (Chinese: \\u9648\\u660e\\u5b87).[183] Their second daughter was born in August 2017.[184] Zuckerberg and his wife welcomed their third daughter in March 2023 and announced the news across his social media pages.[185] The couple also have a Puli dog named Beast,[186] who has over two million followers on Facebook.[187] Zuckerberg commissioned the visual artist Daniel Arsham to build a 7-foot-tall sculpture of his wife, which was unveiled in 2024.[188]\", \"score\": 0.05564338, \"raw_content\": null}]}", + "content": "{\"query\": \"current CEO of Meta\", \"top_k\": [{\"title\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\", \"url\": \"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\", \"content\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\u00a9 2025 Meta\", \"score\": 0.79099923, \"raw_content\": null}, {\"title\": \"Meet the Executive CSuite Team of Meta (Facebook) [2025]\", \"url\": \"https://digitaldefynd.com/IQ/meet-the-executive-csuite-team-of-meta-facebook/\", \"content\": \"Harvard University Executive Programs Free Harvard University Courses As a chief financial officer of Meta, Susan Li oversees the firm\\u2019s finance and facilities team to keep track of the company\\u2019s overall financial health. The chief operating officer of Meta, Javier Olivan, oversees the firm\\u2019s business team, infrastructure, and other products. Andrew Bosworth, called Boz, serves as chief technology officer at Meta and is responsible for leading the firm\\u2019s AR/VR organization, Reality Labs. Andrew has also served as engineering director to oversee events, mobile monetization, and feed ads and as VP of ads and business platforms to lead engineering, design, analytics, and product teams. Meta\\u2019s c-suite team comprises experienced and diverse executives, having extensive experience in technology, finance, legal, and all major industries.\", \"score\": 0.7602419, \"raw_content\": null}, {\"title\": \"Meta - Leadership & Governance\", \"url\": \"https://investor.atmeta.com/leadership-and-governance/\", \"content\": \"Mr. Andreessen was a co-founder of Netscape Communications Corporation, a software company, serving in various positions, including Chief Technology Officer and Executive Vice President of Products. Ms. Killefer also served as Assistant Secretary for Management, Chief Financial Officer, and Chief Operating Officer of the U.S. Department of the Treasury from 1997 to 2000 and as a member of the IRS Oversight Board from 2000 to 2005, including as Chair of the IRS Oversight Board from 2002 to 2004. Ms. Travis has served as Executive Vice President and Chief Financial Officer of The Estee Lauder Companies Inc., a global manufacturer and marketer of skin care, makeup, fragrance and hair care products, since August 2012.\", \"score\": 0.6175132, \"raw_content\": null}, {\"title\": \"Executives - Meta\", \"url\": \"https://about.meta.com/media-gallery/executives/\", \"content\": \"Meta leadership: images of senior executives for download to use in articles about the company.\", \"score\": 0.21026355, \"raw_content\": null}, {\"title\": \"Mark Zuckerberg - Wikipedia\", \"url\": \"https://en.wikipedia.org/wiki/Mark_Zuckerberg\", \"content\": \"They began dating in 2003.[175] In September 2010, Chan, who was a medical student at the University of California, San Francisco at the time,[176] moved into his rented house in Palo Alto, California.[177][178] They married on May 19, 2012, in the grounds of his mansion in an event that also celebrated her graduation from medical school.[179][180] Zuckerberg revealed in July 2015 that they were expecting a baby girl and that Chan had previously experienced three miscarriages.[181] Their first daughter was born in December 2015.[182] They announced in a Chinese New Year video that their daughter's Chinese name is Chen Mingyu (Chinese: \\u9648\\u660e\\u5b87).[183] Their second daughter was born in August 2017.[184] Zuckerberg and his wife welcomed their third daughter in March 2023 and announced the news across his social media pages.[185] The couple also have a Puli dog named Beast,[186] who has over two million followers on Facebook.[187] Zuckerberg commissioned the visual artist Daniel Arsham to build a 7-foot-tall sculpture of his wife, which was unveiled in 2024.[188]\", \"score\": 0.05564338, \"raw_content\": null}]}", "error_code": null, "error_message": null, "metadata": null @@ -437,23 +450,23 @@ "type": "text" }, { - "text": "Result 1:\nDocument_id:20e5d\nContent: .. _lora_finetune_label:\n\n============================\nFine-Tuning Llama2 with LoRA\n============================\n\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\n See :ref:`below` for how to do this.\n\nLet's inspect each of these models a bit more closely.\n\n.. code-block:: bash\n\n # Print the first layer's self-attention in the usual Llama2 model\n >>> print(base_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (pos_embeddings): RotaryPositionalEmbeddings()\n )\n\n # Print the same for Llama2 with LoRA weights\n >>> print(lora_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): LoRALinear(\n (dropout): Dropout(p=0.0, inplace=False)\n \n", + "text": "Result 2:\nDocument_id:4bcdb\nContent: LoRA to Llama2 models\n------------------------------\n\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\n\n.. code-block:: python\n\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\n\n # Build Llama2 without any LoRA layers\n base_model = llama2_7b()\n\n # The default settings for lora_llama2_7b will match those for llama2_7b\n # We just need to define which layers we want LoRA applied to.\n # Within each self-attention, we can choose from [\"q_proj\", \"k_proj\", \"v_proj\", and \"output_proj\"].\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\n # layers outside of the self-attention.\n lora_model = lora_llama2_7b(lora_attn_modules=[\"q_proj\", \"v_proj\"])\n\n.. note::\n\n Calling :func:`lora_llama_2_7b ` alone will not handle the definition of which parameters are trainable.\n See :ref:`below` for how to do this.\n\nLet's inspect each of these models a bit more closely.\n\n.. code-block:: bash\n\n # Print the first layer's self-attention in the usual Llama2 model\n >>> print(base_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (pos_embeddings): RotaryPositionalEmbeddings()\n )\n\n # Print the same for Llama2 with LoRA weights\n >>> print(lora_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): LoRALinear(\n (dropout): Dropout(p=0.0, inplace=False)\n \n", "type": "text" }, { - "text": "Result 3:\nDocument_id:20e5d\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", + "text": "Result 3:\nDocument_id:4bcdb\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", "type": "text" }, { - "text": "Result 4:\nDocument_id:20e5d\nContent: from our Llama2\nmodel without any wrappers or custom checkpoint conversion logic.\n\n.. code-block:: python\n\n # Assuming that base_model already has the pretrained Llama2 weights,\n # this will directly load them into your LoRA model without any conversion necessary.\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\n\n.. note::\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\n :func:`validate_missing_and_unexpected_for_lora() `.\n\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\n\n.. _setting_trainable_params:\n\n.. code-block:: python\n\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\n\n # Fetch all params from the model that are associated with LoRA.\n lora_params = get_adapter_params(lora_model)\n\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\n set_trainable_params(lora_model, lora_params)\n\n # Print the total number of parameters\n total_params = sum([p.numel() for p in lora_model.parameters()])\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\n print(\n f\"\"\"\n {total_params} total params,\n {trainable_params}\" trainable params,\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\n \"\"\"\n )\n\n 6742609920 total params,\n 4194304 trainable params,\n 0.06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `.\n\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\n\n.. _setting_trainable_params:\n\n.. code-block:: python\n\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\n\n # Fetch all params from the model that are associated with LoRA.\n lora_params = get_adapter_params(lora_model)\n\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\n set_trainable_params(lora_model, lora_params)\n\n # Print the total number of parameters\n total_params = sum([p.numel() for p in lora_model.parameters()])\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\n print(\n f\"\"\"\n {total_params} total params,\n {trainable_params}\" trainable params,\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\n \"\"\"\n )\n\n 6742609920 total params,\n 4194304 trainable params,\n 0.06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe Date: Wed, 12 Mar 2025 16:23:36 -0700 Subject: [PATCH 03/14] pre --- tests/integration/scoring/test_scoring.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/integration/scoring/test_scoring.py b/tests/integration/scoring/test_scoring.py index 86415c2936..6c2623705e 100644 --- a/tests/integration/scoring/test_scoring.py +++ b/tests/integration/scoring/test_scoring.py @@ -113,7 +113,7 @@ def test_scoring_score(llama_stack_client): for x in scoring_functions: assert x in response.results assert len(response.results[x].score_rows) == 5 - + llama_stack_client.datasets.unregister("test_dataset") @@ -158,7 +158,7 @@ def test_scoring_score_with_params_llm_as_judge(llama_stack_client, sample_judge for x in scoring_functions: assert x in response.results assert len(response.results[x].score_rows) == 5 - + llama_stack_client.datasets.unregister("test_dataset") @@ -228,4 +228,4 @@ def test_scoring_score_with_aggregation_functions( assert len(response.results[x].score_rows) == len(rows.rows) assert len(response.results[x].aggregated_results) == len(aggr_fns) - llama_stack_client.datasets.unregister("test_dataset") \ No newline at end of file + llama_stack_client.datasets.unregister("test_dataset") From d695d26b77a8ea253b1bc180acfef07bc3d0ca7d Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Wed, 12 Mar 2025 16:27:07 -0700 Subject: [PATCH 04/14] ci --- .../recorded_responses/chat_completion.json | 17324 +++------------- .../recorded_responses/invoke_tool.json | 55 +- 2 files changed, 2957 insertions(+), 14422 deletions(-) diff --git a/tests/integration/fixtures/recorded_responses/chat_completion.json b/tests/integration/fixtures/recorded_responses/chat_completion.json index 9148386248..7234b6c31d 100644 --- a/tests/integration/fixtures/recorded_responses/chat_completion.json +++ b/tests/integration/fixtures/recorded_responses/chat_completion.json @@ -26738,7 +26738,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. \", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": false, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-212\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": false, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-212\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. \", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -26766,7 +26766,7 @@ "data": { "event": { "delta": { - "text": "[", + "text": "The", "type": "text" }, "event_type": { @@ -26786,7 +26786,7 @@ "data": { "event": { "delta": { - "text": "get_boiling_point(liquid", + "text": " provided function definitions", "type": "text" }, "event_type": { @@ -26806,7 +26806,7 @@ "data": { "event": { "delta": { - "text": "_name=\"polyjuice\",", + "text": " are not suitable", "type": "text" }, "event_type": { @@ -26826,7 +26826,7 @@ "data": { "event": { "delta": { - "text": " celcius=True)]", + "text": " for this task. Please re", "type": "text" }, "event_type": { @@ -26846,20 +26846,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "celcius": true, - "liquid_name": "polyjuice" - }, - "call_id": "a985b4ae-b665-4931-baea-8dc633a063a4", - "tool_name": "get_boiling_point" - }, - "type": "tool_call" + "text": "work them to", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -26867,11 +26855,27 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " align with the task requirements.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null }, "metrics": null } @@ -26904,16 +26908,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "GzGznnkt", + "span_id": "D2n_IS_8", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:16:44.978488+00:00", + "__datetime__": "2025-03-07T02:03:32.021393+00:00", "__module__": "datetime" }, - "trace_id": "78cDdNNeSnusAfVf", + "trace_id": "amAiZv5PQKSsA74j", "type": "metric", "unit": "tokens", - "value": 231 + "value": 90 }, { "attributes": { @@ -26921,16 +26925,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "GzGznnkt", + "span_id": "D2n_IS_8", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:16:44.978521+00:00", + "__datetime__": "2025-03-07T02:03:32.021420+00:00", "__module__": "datetime" }, - "trace_id": "78cDdNNeSnusAfVf", + "trace_id": "amAiZv5PQKSsA74j", "type": "metric", "unit": "tokens", - "value": 28 + "value": 32 }, { "attributes": { @@ -26938,16 +26942,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "GzGznnkt", + "span_id": "D2n_IS_8", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:16:44.978527+00:00", + "__datetime__": "2025-03-07T02:03:32.021427+00:00", "__module__": "datetime" }, - "trace_id": "78cDdNNeSnusAfVf", + "trace_id": "amAiZv5PQKSsA74j", "type": "metric", "unit": "tokens", - "value": 259 + "value": 122 } ] } @@ -26955,7 +26959,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. \", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": false, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-212\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. \", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -27003,67 +27007,7 @@ "data": { "event": { "delta": { - "text": "get_boiling_point", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "(liquid_name=\"", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "polyjuice\",", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " celcius=False", + "text": "get_boiling_point(liquid_name=\"polyjuice\", celcius", "type": "text" }, "event_type": { @@ -27083,7 +27027,7 @@ "data": { "event": { "delta": { - "text": ")]", + "text": "=True)]", "type": "text" }, "event_type": { @@ -27110,10 +27054,10 @@ }, "tool_call": { "arguments": { - "celcius": false, + "celcius": true, "liquid_name": "polyjuice" }, - "call_id": "4736e424-7686-434d-8365-e1ecd942772e", + "call_id": "fc83cd58-3cfb-431d-a1e2-a8572d682e2f", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -27161,16 +27105,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "W_ToMqBJ", + "span_id": "YhFB39Ik", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:16:43.429519+00:00", + "__datetime__": "2025-03-07T01:44:31.335148+00:00", "__module__": "datetime" }, - "trace_id": "KJPwXGhmQuK-RWvz", + "trace_id": "3n2xEtjLQt6ZGVR_", "type": "metric", "unit": "tokens", - "value": 184 + "value": 267 }, { "attributes": { @@ -27178,13 +27122,13 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "W_ToMqBJ", + "span_id": "YhFB39Ik", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:16:43.429546+00:00", + "__datetime__": "2025-03-07T01:44:31.335179+00:00", "__module__": "datetime" }, - "trace_id": "KJPwXGhmQuK-RWvz", + "trace_id": "3n2xEtjLQt6ZGVR_", "type": "metric", "unit": "tokens", "value": 28 @@ -27195,16 +27139,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "W_ToMqBJ", + "span_id": "YhFB39Ik", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:16:43.429552+00:00", + "__datetime__": "2025-03-07T01:44:31.335185+00:00", "__module__": "datetime" }, - "trace_id": "KJPwXGhmQuK-RWvz", + "trace_id": "3n2xEtjLQt6ZGVR_", "type": "metric", "unit": "tokens", - "value": 212 + "value": 295 } ] } @@ -27212,7 +27156,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. \", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": false, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-212\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. \", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -27260,67 +27204,7 @@ "data": { "event": { "delta": { - "text": "get_boiling", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "_point(liquid_name", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "=\"polyjuice", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "\", celcius", + "text": "get_boiling_point(liquid_name=\"polyjuice\", celcius", "type": "text" }, "event_type": { @@ -27370,7 +27254,7 @@ "celcius": true, "liquid_name": "polyjuice" }, - "call_id": "377375df-0ee4-4a96-b7c7-9b26aa6bf7e4", + "call_id": "7d41a671-f3ce-46dd-b001-443aaa65ccb7", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -27418,16 +27302,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "kv_YUQAA", + "span_id": "lnqeV_cZ", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:16:41.422850+00:00", + "__datetime__": "2025-03-07T01:44:29.708270+00:00", "__module__": "datetime" }, - "trace_id": "ynpACzVfQD6zAEOf", + "trace_id": "me4qbUSCQ5yKvrAG", "type": "metric", "unit": "tokens", - "value": 137 + "value": 211 }, { "attributes": { @@ -27435,13 +27319,13 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "kv_YUQAA", + "span_id": "lnqeV_cZ", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:16:41.422897+00:00", + "__datetime__": "2025-03-07T01:44:29.708281+00:00", "__module__": "datetime" }, - "trace_id": "ynpACzVfQD6zAEOf", + "trace_id": "me4qbUSCQ5yKvrAG", "type": "metric", "unit": "tokens", "value": 28 @@ -27452,16 +27336,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "kv_YUQAA", + "span_id": "lnqeV_cZ", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:16:41.422904+00:00", + "__datetime__": "2025-03-07T01:44:29.708284+00:00", "__module__": "datetime" }, - "trace_id": "ynpACzVfQD6zAEOf", + "trace_id": "me4qbUSCQ5yKvrAG", "type": "metric", "unit": "tokens", - "value": 165 + "value": 239 } ] } @@ -27469,7 +27353,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. \", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. \", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -27517,67 +27401,7 @@ "data": { "event": { "delta": { - "text": "get_boiling", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "_point(liquid_name", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "=\"polyjuice", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "\", celcius", + "text": "get_boiling_point(liquid_name=\"polyjuice\", celcius", "type": "text" }, "event_type": { @@ -27597,7 +27421,7 @@ "data": { "event": { "delta": { - "text": "=False)]", + "text": "=True)]", "type": "text" }, "event_type": { @@ -27624,10 +27448,10 @@ }, "tool_call": { "arguments": { - "celcius": false, + "celcius": true, "liquid_name": "polyjuice" }, - "call_id": "8aa692c5-bc2e-4e77-80ca-749b27386818", + "call_id": "21c8e60f-d205-4b3d-b065-47fa56dcd273", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -27675,16 +27499,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "pNtd6Xcf", + "span_id": "TDJHPVDZ", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:16:39.488074+00:00", + "__datetime__": "2025-03-07T01:44:28.195776+00:00", "__module__": "datetime" }, - "trace_id": "G34m9Yx8QIG_hNx-", + "trace_id": "r2GKj8iqTYaNxTeq", "type": "metric", "unit": "tokens", - "value": 90 + "value": 155 }, { "attributes": { @@ -27692,13 +27516,13 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "pNtd6Xcf", + "span_id": "TDJHPVDZ", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:16:39.488109+00:00", + "__datetime__": "2025-03-07T01:44:28.195808+00:00", "__module__": "datetime" }, - "trace_id": "G34m9Yx8QIG_hNx-", + "trace_id": "r2GKj8iqTYaNxTeq", "type": "metric", "unit": "tokens", "value": 28 @@ -27709,16 +27533,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "pNtd6Xcf", + "span_id": "TDJHPVDZ", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:16:39.488115+00:00", + "__datetime__": "2025-03-07T01:44:28.195814+00:00", "__module__": "datetime" }, - "trace_id": "G34m9Yx8QIG_hNx-", + "trace_id": "r2GKj8iqTYaNxTeq", "type": "metric", "unit": "tokens", - "value": 118 + "value": 183 } ] } @@ -27726,7 +27550,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. \", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. \", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -27824,7 +27648,7 @@ "celcius": true, "liquid_name": "polyjuice" }, - "call_id": "fc83cd58-3cfb-431d-a1e2-a8572d682e2f", + "call_id": "135d468e-6391-401d-a3c0-3b08c3a6eb8c", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -27872,16 +27696,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "YhFB39Ik", + "span_id": "8pZtsyNW", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:31.335148+00:00", + "__datetime__": "2025-03-07T01:47:51.321089+00:00", "__module__": "datetime" }, - "trace_id": "3n2xEtjLQt6ZGVR_", + "trace_id": "1Ly70plQQGel5jgc", "type": "metric", "unit": "tokens", - "value": 267 + "value": 99 }, { "attributes": { @@ -27889,13 +27713,13 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "YhFB39Ik", + "span_id": "8pZtsyNW", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:31.335179+00:00", + "__datetime__": "2025-03-07T01:47:51.321130+00:00", "__module__": "datetime" }, - "trace_id": "3n2xEtjLQt6ZGVR_", + "trace_id": "1Ly70plQQGel5jgc", "type": "metric", "unit": "tokens", "value": 28 @@ -27906,16 +27730,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "YhFB39Ik", + "span_id": "8pZtsyNW", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:31.335185+00:00", + "__datetime__": "2025-03-07T01:47:51.321140+00:00", "__module__": "datetime" }, - "trace_id": "3n2xEtjLQt6ZGVR_", + "trace_id": "1Ly70plQQGel5jgc", "type": "metric", "unit": "tokens", - "value": 295 + "value": 127 } ] } @@ -27923,7 +27747,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. \", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. \", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -27971,7 +27795,7 @@ "data": { "event": { "delta": { - "text": "get_boiling_point(liquid_name=\"polyjuice\", celcius", + "text": "get_boiling_point(liquid_name='polyjuice", "type": "text" }, "event_type": { @@ -27991,7 +27815,7 @@ "data": { "event": { "delta": { - "text": "=True)]", + "text": "', celcius=True)]", "type": "text" }, "event_type": { @@ -28021,7 +27845,7 @@ "celcius": true, "liquid_name": "polyjuice" }, - "call_id": "7d41a671-f3ce-46dd-b001-443aaa65ccb7", + "call_id": "3955f756-9aa0-433f-be8f-af8941c220de", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -28069,16 +27893,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "lnqeV_cZ", + "span_id": "QZ6PSGpT", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:29.708270+00:00", + "__datetime__": "2025-03-07T02:03:29.629456+00:00", "__module__": "datetime" }, - "trace_id": "me4qbUSCQ5yKvrAG", + "trace_id": "M72bosg8TBe3uhx3", "type": "metric", "unit": "tokens", - "value": 211 + "value": 43 }, { "attributes": { @@ -28086,13 +27910,13 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "lnqeV_cZ", + "span_id": "QZ6PSGpT", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:29.708281+00:00", + "__datetime__": "2025-03-07T02:03:29.629488+00:00", "__module__": "datetime" }, - "trace_id": "me4qbUSCQ5yKvrAG", + "trace_id": "M72bosg8TBe3uhx3", "type": "metric", "unit": "tokens", "value": 28 @@ -28103,16 +27927,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "lnqeV_cZ", + "span_id": "QZ6PSGpT", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:29.708284+00:00", + "__datetime__": "2025-03-07T02:03:29.629494+00:00", "__module__": "datetime" }, - "trace_id": "me4qbUSCQ5yKvrAG", + "trace_id": "M72bosg8TBe3uhx3", "type": "metric", "unit": "tokens", - "value": 239 + "value": 71 } ] } @@ -28120,7 +27944,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. \", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -28148,7 +27972,7 @@ "data": { "event": { "delta": { - "text": "[", + "text": "The", "type": "text" }, "event_type": { @@ -28168,7 +27992,7 @@ "data": { "event": { "delta": { - "text": "get_boiling_point(liquid_name=\"polyjuice\", celcius", + "text": " function call returned an", "type": "text" }, "event_type": { @@ -28188,7 +28012,7 @@ "data": { "event": { "delta": { - "text": "=True)]", + "text": " error since", "type": "text" }, "event_type": { @@ -28208,20 +28032,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "celcius": true, - "liquid_name": "polyjuice" - }, - "call_id": "21c8e60f-d205-4b3d-b065-47fa56dcd273", - "tool_name": "get_boiling_point" - }, - "type": "tool_call" + "text": " \"", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -28229,11 +28041,7 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } @@ -28244,94 +28052,53 @@ "data": { "event": { "delta": { - "text": "", + "text": "polyjuice\" is", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "TDJHPVDZ", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:28.195776+00:00", - "__module__": "datetime" - }, - "trace_id": "r2GKj8iqTYaNxTeq", - "type": "metric", - "unit": "tokens", - "value": 155 + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " not a real liquid. Polyju", + "type": "text" }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "TDJHPVDZ", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:28.195808+00:00", - "__module__": "datetime" - }, - "trace_id": "r2GKj8iqTYaNxTeq", - "type": "metric", - "unit": "tokens", - "value": 28 + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "TDJHPVDZ", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:28.195814+00:00", - "__module__": "datetime" - }, - "trace_id": "r2GKj8iqTYaNxTeq", - "type": "metric", - "unit": "tokens", - "value": 183 - } - ] + "logprobs": null, + "stop_reason": null + }, + "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. \", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", + "text": "ice is a fictional substance from the", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -28345,7 +28112,7 @@ "data": { "event": { "delta": { - "text": "[", + "text": " Harry Potter series. The boiling point", "type": "text" }, "event_type": { @@ -28365,7 +28132,7 @@ "data": { "event": { "delta": { - "text": "get_boiling_point(liquid_name=\"polyjuice\", celcius", + "text": " of a substance is a physical", "type": "text" }, "event_type": { @@ -28385,7 +28152,7 @@ "data": { "event": { "delta": { - "text": "=True)]", + "text": " property that can be measured and", "type": "text" }, "event_type": { @@ -28405,20 +28172,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "celcius": true, - "liquid_name": "polyjuice" - }, - "call_id": "135d468e-6391-401d-a3c0-3b08c3a6eb8c", - "tool_name": "get_boiling_point" - }, - "type": "tool_call" + "text": " quantified", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -28426,11 +28181,47 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ", but it only applies", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " to real substances that exist in the physical world.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null }, "metrics": null } @@ -28463,16 +28254,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "8pZtsyNW", + "span_id": "y9SHtJTQ", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:47:51.321089+00:00", + "__datetime__": "2025-03-07T02:05:01.411612+00:00", "__module__": "datetime" }, - "trace_id": "1Ly70plQQGel5jgc", + "trace_id": "_I2Cu85IRtOSBSX9", "type": "metric", "unit": "tokens", - "value": 99 + "value": 84 }, { "attributes": { @@ -28480,16 +28271,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "8pZtsyNW", + "span_id": "y9SHtJTQ", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:47:51.321130+00:00", + "__datetime__": "2025-03-07T02:05:01.411644+00:00", "__module__": "datetime" }, - "trace_id": "1Ly70plQQGel5jgc", + "trace_id": "_I2Cu85IRtOSBSX9", "type": "metric", "unit": "tokens", - "value": 28 + "value": 73 }, { "attributes": { @@ -28497,16 +28288,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "8pZtsyNW", + "span_id": "y9SHtJTQ", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:47:51.321140+00:00", + "__datetime__": "2025-03-07T02:05:01.411650+00:00", "__module__": "datetime" }, - "trace_id": "1Ly70plQQGel5jgc", + "trace_id": "_I2Cu85IRtOSBSX9", "type": "metric", "unit": "tokens", - "value": 127 + "value": 157 } ] } @@ -28514,7 +28305,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. \", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -28542,7 +28333,7 @@ "data": { "event": { "delta": { - "text": "[", + "text": "The", "type": "text" }, "event_type": { @@ -28562,7 +28353,7 @@ "data": { "event": { "delta": { - "text": "get_boiling_point", + "text": " function get_boiling_point is not", "type": "text" }, "event_type": { @@ -28582,7 +28373,7 @@ "data": { "event": { "delta": { - "text": "(liquid_name='polyju", + "text": " recognized.", "type": "text" }, "event_type": { @@ -28602,89 +28393,13 @@ "data": { "event": { "delta": { - "text": "ice', celcius=True", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": ")]", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "celcius": true, - "liquid_name": "polyjuice" - }, - "call_id": "b30bc9bd-2ba2-4016-a319-a5321c217282", - "tool_name": "get_boiling_point" - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "complete" }, "logprobs": null, "stop_reason": { @@ -28700,16 +28415,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "XJHIgX_A", + "span_id": "Z7jBGJ-8", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:16:37.978838+00:00", + "__datetime__": "2025-03-07T01:45:55.401637+00:00", "__module__": "datetime" }, - "trace_id": "FvxBc5KZSX2OJ-XT", + "trace_id": "WxMAq579Q-ixJ3wJ", "type": "metric", "unit": "tokens", - "value": 43 + "value": 93 }, { "attributes": { @@ -28717,16 +28432,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "XJHIgX_A", + "span_id": "Z7jBGJ-8", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:16:37.978902+00:00", + "__datetime__": "2025-03-07T01:45:55.401666+00:00", "__module__": "datetime" }, - "trace_id": "FvxBc5KZSX2OJ-XT", + "trace_id": "WxMAq579Q-ixJ3wJ", "type": "metric", "unit": "tokens", - "value": 28 + "value": 20 }, { "attributes": { @@ -28734,16 +28449,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "XJHIgX_A", + "span_id": "Z7jBGJ-8", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:16:37.978910+00:00", + "__datetime__": "2025-03-07T01:45:55.401670+00:00", "__module__": "datetime" }, - "trace_id": "FvxBc5KZSX2OJ-XT", + "trace_id": "WxMAq579Q-ixJ3wJ", "type": "metric", "unit": "tokens", - "value": 71 + "value": 113 } ] } @@ -28751,7 +28466,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point_with_metadata\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point_with_metadata\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -28799,7 +28514,67 @@ "data": { "event": { "delta": { - "text": " function call returned an", + "text": " function get_bo", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "iling_point_with_metadata does not exist,", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " I will", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " assume you", "type": "text" }, "event_type": { @@ -28819,7 +28594,7 @@ "data": { "event": { "delta": { - "text": " error", + "text": " meant get_bo", "type": "text" }, "event_type": { @@ -28839,7 +28614,7 @@ "data": { "event": { "delta": { - "text": ", poly", + "text": "iling_point_with_metadata", "type": "text" }, "event_type": { @@ -28859,7 +28634,7 @@ "data": { "event": { "delta": { - "text": "juice is not", + "text": ". The boiling point of polyjuice", "type": "text" }, "event_type": { @@ -28879,7 +28654,7 @@ "data": { "event": { "delta": { - "text": " a real liquid.", + "text": " is -100.", "type": "text" }, "event_type": { @@ -28921,16 +28696,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "i15F3AnP", + "span_id": "8dM6i5mO", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:18:16.174015+00:00", + "__datetime__": "2025-03-07T02:05:03.329281+00:00", "__module__": "datetime" }, - "trace_id": "0IGoDzUNTxC53bAN", + "trace_id": "zMJDP5dXRrChi7uE", "type": "metric", "unit": "tokens", - "value": 84 + "value": 86 }, { "attributes": { @@ -28938,16 +28713,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "i15F3AnP", + "span_id": "8dM6i5mO", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:18:16.174351+00:00", + "__datetime__": "2025-03-07T02:05:03.329312+00:00", "__module__": "datetime" }, - "trace_id": "0IGoDzUNTxC53bAN", + "trace_id": "zMJDP5dXRrChi7uE", "type": "metric", "unit": "tokens", - "value": 26 + "value": 45 }, { "attributes": { @@ -28955,16 +28730,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "i15F3AnP", + "span_id": "8dM6i5mO", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:18:16.174361+00:00", + "__datetime__": "2025-03-07T02:05:03.329318+00:00", "__module__": "datetime" }, - "trace_id": "0IGoDzUNTxC53bAN", + "trace_id": "zMJDP5dXRrChi7uE", "type": "metric", "unit": "tokens", - "value": 110 + "value": 131 } ] } @@ -28972,7 +28747,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point_with_metadata\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point_with_metadata` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point_with_metadata\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -29020,7 +28795,7 @@ "data": { "event": { "delta": { - "text": " function get_boiling_point is not", + "text": " function get_boiling_point_with_metadata(", "type": "text" }, "event_type": { @@ -29040,7 +28815,27 @@ "data": { "event": { "delta": { - "text": " recognized.", + "text": "liquid_name=\"polyjuice\", celcius=True) should be", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " used to get the answer.", "type": "text" }, "event_type": { @@ -29082,16 +28877,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "Z7jBGJ-8", + "span_id": "pzQMKAJc", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:55.401637+00:00", + "__datetime__": "2025-03-07T01:45:56.809816+00:00", "__module__": "datetime" }, - "trace_id": "WxMAq579Q-ixJ3wJ", + "trace_id": "018KkGcOThSSiZfE", "type": "metric", "unit": "tokens", - "value": 93 + "value": 97 }, { "attributes": { @@ -29099,16 +28894,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "Z7jBGJ-8", + "span_id": "pzQMKAJc", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:55.401666+00:00", + "__datetime__": "2025-03-07T01:45:56.809911+00:00", "__module__": "datetime" }, - "trace_id": "WxMAq579Q-ixJ3wJ", + "trace_id": "018KkGcOThSSiZfE", "type": "metric", "unit": "tokens", - "value": 20 + "value": 39 }, { "attributes": { @@ -29116,16 +28911,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "Z7jBGJ-8", + "span_id": "pzQMKAJc", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:55.401670+00:00", + "__datetime__": "2025-03-07T01:45:56.809922+00:00", "__module__": "datetime" }, - "trace_id": "WxMAq579Q-ixJ3wJ", + "trace_id": "018KkGcOThSSiZfE", "type": "metric", "unit": "tokens", - "value": 113 + "value": 136 } ] } @@ -29133,7 +28928,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point_with_metadata\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point_with_metadata\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -29161,27 +28956,7 @@ "data": { "event": { "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " function call should have", + "text": "[", "type": "text" }, "event_type": { @@ -29201,7 +28976,7 @@ "data": { "event": { "delta": { - "text": " been get", + "text": "get_boiling_point(liquid_name='polyjuice", "type": "text" }, "event_type": { @@ -29221,7 +28996,7 @@ "data": { "event": { "delta": { - "text": "_boiling_point_with", + "text": "', celcius=True)]", "type": "text" }, "event_type": { @@ -29241,8 +29016,20 @@ "data": { "event": { "delta": { - "text": "_metadata, I", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "celcius": true, + "liquid_name": "polyjuice" + }, + "call_id": "328cb19d-47bb-47cc-8258-a5ca2e26803e", + "tool_name": "get_boiling_point" + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -29250,7 +29037,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -29261,100 +29052,20 @@ "data": { "event": { "delta": { - "text": " will", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " make sure to use", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " the correct function name in", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " the future.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": [ { @@ -29363,16 +29074,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "2UaLOS7T", + "span_id": "dS0bhfN_", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:18:19.016688+00:00", + "__datetime__": "2025-03-07T02:04:53.324788+00:00", "__module__": "datetime" }, - "trace_id": "tm3A32woQsmtUmLd", + "trace_id": "UJz5Cas1SDyQYeBk", "type": "metric", "unit": "tokens", - "value": 86 + "value": 37 }, { "attributes": { @@ -29380,16 +29091,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "2UaLOS7T", + "span_id": "dS0bhfN_", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:18:19.016723+00:00", + "__datetime__": "2025-03-07T02:04:53.324835+00:00", "__module__": "datetime" }, - "trace_id": "tm3A32woQsmtUmLd", + "trace_id": "UJz5Cas1SDyQYeBk", "type": "metric", "unit": "tokens", - "value": 37 + "value": 28 }, { "attributes": { @@ -29397,16 +29108,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "2UaLOS7T", + "span_id": "dS0bhfN_", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:18:19.016730+00:00", + "__datetime__": "2025-03-07T02:04:53.324844+00:00", "__module__": "datetime" }, - "trace_id": "tm3A32woQsmtUmLd", + "trace_id": "UJz5Cas1SDyQYeBk", "type": "metric", "unit": "tokens", - "value": 123 + "value": 65 } ] } @@ -29414,7 +29125,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point_with_metadata\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point_with_metadata` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point_with_metadata\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -29442,7 +29153,7 @@ "data": { "event": { "delta": { - "text": "The", + "text": "[", "type": "text" }, "event_type": { @@ -29462,7 +29173,7 @@ "data": { "event": { "delta": { - "text": " function get_boiling_point_with_metadata(", + "text": "get_boiling_point_with_metadata", "type": "text" }, "event_type": { @@ -29482,7 +29193,7 @@ "data": { "event": { "delta": { - "text": "liquid_name=\"polyjuice\", celcius=True) should be", + "text": "(liquid_name='polyjuice', cel", "type": "text" }, "event_type": { @@ -29502,7 +29213,7 @@ "data": { "event": { "delta": { - "text": " used to get the answer.", + "text": "cius=True)]", "type": "text" }, "event_type": { @@ -29516,6 +29227,42 @@ "metrics": null } }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "celcius": true, + "liquid_name": "polyjuice" + }, + "call_id": "5bb48d00-7d5c-49e2-bddf-e5fdc5f35485", + "tool_name": "get_boiling_point_with_metadata" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", @@ -29544,16 +29291,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "pzQMKAJc", + "span_id": "mfrFN7m2", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:56.809816+00:00", + "__datetime__": "2025-03-07T02:05:02.136501+00:00", "__module__": "datetime" }, - "trace_id": "018KkGcOThSSiZfE", + "trace_id": "T4eddr4-SMWPQwKA", "type": "metric", "unit": "tokens", - "value": 97 + "value": 37 }, { "attributes": { @@ -29561,16 +29308,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "pzQMKAJc", + "span_id": "mfrFN7m2", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:56.809911+00:00", + "__datetime__": "2025-03-07T02:05:02.136529+00:00", "__module__": "datetime" }, - "trace_id": "018KkGcOThSSiZfE", + "trace_id": "T4eddr4-SMWPQwKA", "type": "metric", "unit": "tokens", - "value": 39 + "value": 30 }, { "attributes": { @@ -29578,16 +29325,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "pzQMKAJc", + "span_id": "mfrFN7m2", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:56.809922+00:00", + "__datetime__": "2025-03-07T02:05:02.136535+00:00", "__module__": "datetime" }, - "trace_id": "018KkGcOThSSiZfE", + "trace_id": "T4eddr4-SMWPQwKA", "type": "metric", "unit": "tokens", - "value": 136 + "value": 67 } ] } @@ -29595,7 +29342,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Give me a sentence that contains the word: hello\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": []}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -29623,7 +29370,7 @@ "data": { "event": { "delta": { - "text": "[", + "text": "When", "type": "text" }, "event_type": { @@ -29643,7 +29390,7 @@ "data": { "event": { "delta": { - "text": "get_boiling_point", + "text": " I answered the", "type": "text" }, "event_type": { @@ -29663,7 +29410,7 @@ "data": { "event": { "delta": { - "text": "(liquid_name='polyju", + "text": " phone, the friendly", "type": "text" }, "event_type": { @@ -29683,7 +29430,7 @@ "data": { "event": { "delta": { - "text": "ice', celcius=True", + "text": " voice on the other end said \"hello\"", "type": "text" }, "event_type": { @@ -29703,7 +29450,7 @@ "data": { "event": { "delta": { - "text": ")]", + "text": " and asked how I was doing.", "type": "text" }, "event_type": { @@ -29717,42 +29464,6 @@ "metrics": null } }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "celcius": true, - "liquid_name": "polyjuice" - }, - "call_id": "31855436-909b-43d0-9247-05d1f329d2e9", - "tool_name": "get_boiling_point" - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": null - } - }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", @@ -29781,16 +29492,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "X6bzcHl6", + "span_id": "tJEuRhla", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:18:14.474786+00:00", + "__datetime__": "2025-03-07T01:44:01.044284+00:00", "__module__": "datetime" }, - "trace_id": "-KkgkeBSQBq56Y7A", + "trace_id": "bnDS7Z41TRO0UyfH", "type": "metric", "unit": "tokens", - "value": 37 + "value": 30 }, { "attributes": { @@ -29798,16 +29509,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "X6bzcHl6", + "span_id": "tJEuRhla", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:18:14.474813+00:00", + "__datetime__": "2025-03-07T01:44:01.044312+00:00", "__module__": "datetime" }, - "trace_id": "-KkgkeBSQBq56Y7A", + "trace_id": "bnDS7Z41TRO0UyfH", "type": "metric", "unit": "tokens", - "value": 28 + "value": 34 }, { "attributes": { @@ -29815,16 +29526,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "X6bzcHl6", + "span_id": "tJEuRhla", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:18:14.474820+00:00", + "__datetime__": "2025-03-07T01:44:01.044318+00:00", "__module__": "datetime" }, - "trace_id": "-KkgkeBSQBq56Y7A", + "trace_id": "bnDS7Z41TRO0UyfH", "type": "metric", "unit": "tokens", - "value": 65 + "value": 64 } ] } @@ -29832,7 +29543,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\\n# Sample of data\\nprint(\\\"Data sample from file:\\\")\\nprint(df.head())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\\n# Sample of data\\nprint(\\\"Data sample from file:\\\")\\nprint(df.head())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -29860,7 +29571,7 @@ "data": { "event": { "delta": { - "text": "[", + "text": "I", "type": "text" }, "event_type": { @@ -29880,7 +29591,7 @@ "data": { "event": { "delta": { - "text": "get_boiling_point", + "text": " am not able", "type": "text" }, "event_type": { @@ -29900,7 +29611,7 @@ "data": { "event": { "delta": { - "text": "_with_metadata(liquid_name='", + "text": " to execute this task as", "type": "text" }, "event_type": { @@ -29920,7 +29631,7 @@ "data": { "event": { "delta": { - "text": "polyjuice', cel", + "text": " it exceeds the", "type": "text" }, "event_type": { @@ -29940,7 +29651,7 @@ "data": { "event": { "delta": { - "text": "cius=True)]", + "text": " limitations of the functions I", "type": "text" }, "event_type": { @@ -29960,20 +29671,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "celcius": true, - "liquid_name": "polyjuice" - }, - "call_id": "85d67373-4070-43d2-a484-6d4a97c85e22", - "tool_name": "get_boiling_point_with_metadata" - }, - "type": "tool_call" + "text": " have been given.", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -29981,11 +29680,7 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } @@ -30018,16 +29713,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "bbYbxoH7", + "span_id": "5If5go-q", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:18:17.392364+00:00", + "__datetime__": "2025-03-07T01:45:48.070675+00:00", "__module__": "datetime" }, - "trace_id": "8q7Ao94XQM-Wh7uH", + "trace_id": "StUjhrTMQKKQSRvS", "type": "metric", "unit": "tokens", - "value": 37 + "value": 433 }, { "attributes": { @@ -30035,16 +29730,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "bbYbxoH7", + "span_id": "5If5go-q", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:18:17.392459+00:00", + "__datetime__": "2025-03-07T01:45:48.070742+00:00", "__module__": "datetime" }, - "trace_id": "8q7Ao94XQM-Wh7uH", + "trace_id": "StUjhrTMQKKQSRvS", "type": "metric", "unit": "tokens", - "value": 30 + "value": 31 }, { "attributes": { @@ -30052,16 +29747,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "bbYbxoH7", + "span_id": "5If5go-q", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:18:17.392474+00:00", + "__datetime__": "2025-03-07T01:45:48.070750+00:00", "__module__": "datetime" }, - "trace_id": "8q7Ao94XQM-Wh7uH", + "trace_id": "StUjhrTMQKKQSRvS", "type": "metric", "unit": "tokens", - "value": 67 + "value": 464 } ] } @@ -30069,7 +29764,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Give me a sentence that contains the word: hello\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": []}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\\n# Sample of data\\nprint(\\\"Data sample from file:\\\")\\nprint(df.head())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -30097,8 +29792,13 @@ "data": { "event": { "delta": { - "text": "When", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -30117,8 +29817,13 @@ "data": { "event": { "delta": { - "text": " I answered", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\n# Load data\ndf =", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -30137,28 +29842,13 @@ "data": { "event": { "delta": { - "text": " the phone,", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " the friendly voice", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " pd.read_csv(\"/var/folders/rb/qv8vwgyj", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -30177,8 +29867,13 @@ "data": { "event": { "delta": { - "text": " on the other end", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "6yjd3t4pwsy9t0rm0000", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -30197,8 +29892,13 @@ "data": { "event": { "delta": { - "text": " said \"", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "gn/T/tmp2x_sml66/ZEjbinQHin", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -30217,8 +29917,13 @@ "data": { "event": { "delta": { - "text": "hello\" and asked how I was doing.", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "flation.csv\")\n# Rows\nprint(\"Number of rows and columns in the", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -30237,109 +29942,13 @@ "data": { "event": { "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "oVSdVF2W", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-12T23:15:58.571786+00:00", - "__module__": "datetime" - }, - "trace_id": "tPpkfz4pQZ2NBT2q", - "type": "metric", - "unit": "tokens", - "value": 30 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "oVSdVF2W", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-12T23:15:58.571835+00:00", - "__module__": "datetime" - }, - "trace_id": "tPpkfz4pQZ2NBT2q", - "type": "metric", - "unit": "tokens", - "value": 34 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "oVSdVF2W", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-12T23:15:58.571842+00:00", - "__module__": "datetime" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" }, - "trace_id": "tPpkfz4pQZ2NBT2q", - "type": "metric", - "unit": "tokens", - "value": 64 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\\n# Sample of data\\nprint(\\\"Data sample from file:\\\")\\nprint(df.head())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\\n# Sample of data\\nprint(\\\"Data sample from file:\\\")\\nprint(df.head())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "I", - "type": "text" + "tool_call": " data:\", df.shape)\n# Columns\nprint(\"Columns of the data are:\",", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -30358,8 +29967,13 @@ "data": { "event": { "delta": { - "text": " am not able", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " len(df.columns))\n# Column names\nprint(\"Columns of the data", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -30378,8 +29992,13 @@ "data": { "event": { "delta": { - "text": " to execute this task as", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " are:\", df.columns)\n# Column dtypes\nprint(\"Datatype of the columns are:\", df", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -30398,8 +30017,13 @@ "data": { "event": { "delta": { - "text": " it exceeds the", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".dtypes)\n# Sample of data\nprint(\"Data sample from file:\")\n", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -30418,8 +30042,13 @@ "data": { "event": { "delta": { - "text": " limitations of the functions I", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "print(df.head())", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -30438,8 +30067,23 @@ "data": { "event": { "delta": { - "text": " have been given.", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\n# Load data\ndf = pd.read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4pwsy9t0rm0000gn/T/tmp2x_sml66/ZEjbinQHinflation.csv\")\n# Rows\nprint(\"Number of rows and columns in the data:\", df.shape)\n# Columns\nprint(\"Columns of the data are:\", len(df.columns))\n# Column names\nprint(\"Columns of the data are:\", df.columns)\n# Column dtypes\nprint(\"Datatype of the columns are:\", df.dtypes)\n# Sample of data\nprint(\"Data sample from file:\")\nprint(df.head())" + }, + "call_id": "1df8b196-9eff-4b06-97e7-ab175c741e8f", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -30447,7 +30091,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -30480,16 +30128,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "5If5go-q", + "span_id": "fLqIbpek", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:48.070675+00:00", + "__datetime__": "2025-03-07T01:45:40.262304+00:00", "__module__": "datetime" }, "trace_id": "StUjhrTMQKKQSRvS", "type": "metric", "unit": "tokens", - "value": 433 + "value": 235 }, { "attributes": { @@ -30497,16 +30145,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "5If5go-q", + "span_id": "fLqIbpek", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:48.070742+00:00", + "__datetime__": "2025-03-07T01:45:40.262340+00:00", "__module__": "datetime" }, "trace_id": "StUjhrTMQKKQSRvS", "type": "metric", "unit": "tokens", - "value": 31 + "value": 10 }, { "attributes": { @@ -30514,16 +30162,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "5If5go-q", + "span_id": "fLqIbpek", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:48.070750+00:00", + "__datetime__": "2025-03-07T01:45:40.262347+00:00", "__module__": "datetime" }, "trace_id": "StUjhrTMQKKQSRvS", "type": "metric", "unit": "tokens", - "value": 464 + "value": 245 } ] } @@ -30531,7 +30179,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\\n# Sample of data\\nprint(\\\"Data sample from file:\\\")\\nprint(df.head())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -30559,8 +30207,13 @@ "data": { "event": { "delta": { - "text": "The", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -30579,8 +30232,13 @@ "data": { "event": { "delta": { - "text": " error message indicates that", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\n# Load data\ndf = pd", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -30599,8 +30257,13 @@ "data": { "event": { "delta": { - "text": " the file", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -30619,8 +30282,13 @@ "data": { "event": { "delta": { - "text": " 'bwrap'", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "pwsy9t0rm0000gn/T/tmp2x_sml66/ZEj", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -30639,8 +30307,13 @@ "data": { "event": { "delta": { - "text": " was not found.", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "binQHinflation.csv\")\n# Rows\nprint(\"Number of rows and columns in the data", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -30659,8 +30332,13 @@ "data": { "event": { "delta": { - "text": " This is likely because", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ":\", df.shape)\n# Columns\nprint(\"Columns of the data are:\", len(df.columns))\n#", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -30679,11 +30357,16 @@ "data": { "event": { "delta": { - "text": " the file path provided", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " Column names\nprint(\"Columns of the data are:\", df.columns)\n# Column dtypes\nprint", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", "value": "progress" }, @@ -30699,8 +30382,13 @@ "data": { "event": { "delta": { - "text": " is incorrect or the", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "(\"Datatype of the columns are:\", df.dtypes)\n# Sample", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -30719,8 +30407,13 @@ "data": { "event": { "delta": { - "text": " file does not exist", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " of data\nprint(\"Data sample from file:\")\nprint(df.head())", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -30739,8 +30432,23 @@ "data": { "event": { "delta": { - "text": " in the specified location", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\n# Load data\ndf = pd.read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4pwsy9t0rm0000gn/T/tmp2x_sml66/ZEjbinQHinflation.csv\")\n# Rows\nprint(\"Number of rows and columns in the data:\", df.shape)\n# Columns\nprint(\"Columns of the data are:\", len(df.columns))\n# Column names\nprint(\"Columns of the data are:\", df.columns)\n# Column dtypes\nprint(\"Datatype of the columns are:\", df.dtypes)\n# Sample of data\nprint(\"Data sample from file:\")\nprint(df.head())" + }, + "call_id": "c1708ded-f272-4008-b91f-19d61780c394", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -30748,7 +30456,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -30759,33 +30471,94 @@ "data": { "event": { "delta": { - "text": ".\n\nTo resolve this", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "KTMayjIE", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:37.305765+00:00", + "__module__": "datetime" + }, + "trace_id": "StUjhrTMQKKQSRvS", + "type": "metric", + "unit": "tokens", + "value": 37 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "KTMayjIE", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:37.305820+00:00", + "__module__": "datetime" + }, + "trace_id": "StUjhrTMQKKQSRvS", + "type": "metric", + "unit": "tokens", + "value": 10 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "KTMayjIE", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:37.305832+00:00", + "__module__": "datetime" + }, + "trace_id": "StUjhrTMQKKQSRvS", + "type": "metric", + "unit": "tokens", + "value": 47 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics about the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"The error message indicates that the file \\\"\" does not exist. This could be due to a variety of reasons such as the file being deleted, the path being incorrect, or the file not being accessible.\\n\\nTo resolve this issue, you can try the following:\\n\\n1. Check the file path: Ensure that the file path is correct and the file exists at that location.\\n2. Check file permissions: Ensure that the file is accessible and you have the necessary permissions to read it.\\n3. Try a different file: If the file is not accessible, try loading a different file to see if the issue is specific to this file or a general issue with your code.\\n4. Check for typos: Ensure that there are no typos in the file path or the code.\\n\\nIf you are still having issues, please provide more details about the file and the code you are using, and I'll be happy to help further.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Convert the 'Year' column to datetime\\ndf['Year'] = pd.to_datetime(df['Year'], format='%Y')\\n\\n# Group by 'Year' and calculate the average inflation\\ndf_avg_inflation = df.groupby('Year')['Inflation'].mean().reset_index()\\n\\n# Plot the average yearly inflation as a time series\\nplt.figure(figsize=(10,6))\\nplt.plot(df_avg_inflation['Year'], df_avg_inflation['Inflation'], marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Inflation')\\nplt.grid(True)\\nplt.show()\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " issue, you should", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -30799,7 +30572,7 @@ "data": { "event": { "delta": { - "text": " ensure that", + "text": "The", "type": "text" }, "event_type": { @@ -30819,7 +30592,7 @@ "data": { "event": { "delta": { - "text": " the file path is", + "text": " error message indicates that the file \"/var/folders/rb/qv8", "type": "text" }, "event_type": { @@ -30839,7 +30612,7 @@ "data": { "event": { "delta": { - "text": " correct and the file", + "text": "vwgyj6yjd3t4pwsy9t0", "type": "text" }, "event_type": { @@ -30859,7 +30632,7 @@ "data": { "event": { "delta": { - "text": " exists in the specified", + "text": "rm0000gn/T/tmp2x_sml66/9vY", "type": "text" }, "event_type": { @@ -30879,7 +30652,7 @@ "data": { "event": { "delta": { - "text": " location. If the", + "text": "vmVRoinflation.csv\" does not exist. This could be due to", "type": "text" }, "event_type": { @@ -30899,7 +30672,7 @@ "data": { "event": { "delta": { - "text": " file is located in", + "text": " a variety of reasons such as the file being deleted, the path being incorrect", "type": "text" }, "event_type": { @@ -30919,7 +30692,7 @@ "data": { "event": { "delta": { - "text": " a different directory,", + "text": ", or the file not being accessible.\n\nTo resolve this issue, you can", "type": "text" }, "event_type": { @@ -30939,7 +30712,7 @@ "data": { "event": { "delta": { - "text": " you should provide the", + "text": " try the following:\n\n1. Check the file path: Ensure that the file", "type": "text" }, "event_type": { @@ -30959,7 +30732,7 @@ "data": { "event": { "delta": { - "text": " correct path to", + "text": " path is correct and the file exists at that location.\n2. Check file permissions:", "type": "text" }, "event_type": { @@ -30979,7 +30752,7 @@ "data": { "event": { "delta": { - "text": " the file.\n\n", + "text": " Ensure that the file is accessible and you have the necessary permissions to", "type": "text" }, "event_type": { @@ -30999,7 +30772,7 @@ "data": { "event": { "delta": { - "text": "Additionally,", + "text": " read it.\n3. Try a different file: If the file is not", "type": "text" }, "event_type": { @@ -31019,7 +30792,7 @@ "data": { "event": { "delta": { - "text": " you can use the", + "text": " accessible, try loading a different file to see if the issue is specific to", "type": "text" }, "event_type": { @@ -31039,7 +30812,7 @@ "data": { "event": { "delta": { - "text": " `os` module", + "text": " this file or a general issue with your code.\n4. Check for ty", "type": "text" }, "event_type": { @@ -31059,7 +30832,7 @@ "data": { "event": { "delta": { - "text": " to check if the", + "text": "pos: Ensure that there are no typos in the file path or the", "type": "text" }, "event_type": { @@ -31079,7 +30852,7 @@ "data": { "event": { "delta": { - "text": " file exists before attempting", + "text": " code.\n\nIf you are still having issues, please provide more details about the file and the code", "type": "text" }, "event_type": { @@ -31099,7 +30872,7 @@ "data": { "event": { "delta": { - "text": " to read it", + "text": " you are using, and I'll be happy to help further.", "type": "text" }, "event_type": { @@ -31119,33 +30892,94 @@ "data": { "event": { "delta": { - "text": ". Here's an example", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "f28sT2i7", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:23.262530+00:00", + "__module__": "datetime" + }, + "trace_id": "8YKzpfybSiGgrHOF", + "type": "metric", + "unit": "tokens", + "value": 680 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "f28sT2i7", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:23.262555+00:00", + "__module__": "datetime" + }, + "trace_id": "8YKzpfybSiGgrHOF", + "type": "metric", + "unit": "tokens", + "value": 238 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "f28sT2i7", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:23.262558+00:00", + "__module__": "datetime" + }, + "trace_id": "8YKzpfybSiGgrHOF", + "type": "metric", + "unit": "tokens", + "value": 918 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics about the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"The error message indicates that the file \\\"\" does not exist. This could be due to a variety of reasons such as the file being deleted, the path being incorrect, or the file not being accessible.\\n\\nTo resolve this issue, you can try the following:\\n\\n1. Check the file path: Ensure that the file path is correct and the file exists at that location.\\n2. Check file permissions: Ensure that the file is accessible and you have the necessary permissions to read it.\\n3. Try a different file: If the file is not accessible, try loading a different file to see if the issue is specific to this file or a general issue with your code.\\n4. Check for typos: Ensure that there are no typos in the file path or the code.\\n\\nIf you are still having issues, please provide more details about the file and the code you are using, and I'll be happy to help further.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": ":\n\n```python", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -31159,8 +30993,13 @@ "data": { "event": { "delta": { - "text": "\nimport os\nimport", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -31179,8 +31018,13 @@ "data": { "event": { "delta": { - "text": " pandas as pd\n\nfile_path", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load the CSV", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -31199,8 +31043,13 @@ "data": { "event": { "delta": { - "text": " = \"/var/folders/r", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " file\ndf = pd.read_csv(\"/var/folders/rb/qv", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -31219,8 +31068,13 @@ "data": { "event": { "delta": { - "text": "b/qv8vwgy", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "8vwgyj6yjd3t4pwsy9t", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -31239,8 +31093,13 @@ "data": { "event": { "delta": { - "text": "j6yjd3t", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "0rm0000gn/T/tmp2x_sml66/9v", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -31259,8 +31118,13 @@ "data": { "event": { "delta": { - "text": "4pwsy9t", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "YvmVRoinflation.csv\")\n\n# Convert the 'Year'", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -31279,14 +31143,19 @@ "data": { "event": { "delta": { - "text": "0rm0000gn/T", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " column to datetime\ndf['Year'] = pd.to_datetime(df['Year", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, "logprobs": null, "stop_reason": null }, @@ -31299,8 +31168,13 @@ "data": { "event": { "delta": { - "text": "/tmpjdr", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "'], format='%Y')\n\n# Group by", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -31319,8 +31193,13 @@ "data": { "event": { "delta": { - "text": "nryox/gEWH", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " 'Year' and calculate the average inflation\ndf_avg_in", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -31339,8 +31218,13 @@ "data": { "event": { "delta": { - "text": "hdSVin", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "flation = df.groupby('Year')['Inflation'].mean().reset_index()\n\n", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -31359,8 +31243,13 @@ "data": { "event": { "delta": { - "text": "flation.csv\"\n\nif", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "# Plot the average yearly inflation as a time series\n", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -31379,8 +31268,13 @@ "data": { "event": { "delta": { - "text": " os.path.isfile(file", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "plt.figure(figsize=(10,6))\nplt.plot(df_avg_inflation['", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -31399,8 +31293,13 @@ "data": { "event": { "delta": { - "text": "_path):\n ", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "Year'], df_avg_in", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -31419,8 +31318,13 @@ "data": { "event": { "delta": { - "text": " df = pd.read", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "flation['Inflation'], marker='o')\nplt", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -31439,8 +31343,13 @@ "data": { "event": { "delta": { - "text": "_csv(file_path)\n", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -31459,8 +31368,48 @@ "data": { "event": { "delta": { - "text": " print(\"Number", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "('Inflation')\nplt.grid(True)\nplt.show()", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4pwsy9t0rm0000gn/T/tmp2x_sml66/9vYvmVRoinflation.csv\")\n\n# Convert the 'Year' column to datetime\ndf['Year'] = pd.to_datetime(df['Year'], format='%Y')\n\n# Group by 'Year' and calculate the average inflation\ndf_avg_inflation = df.groupby('Year')['Inflation'].mean().reset_index()\n\n# Plot the average yearly inflation as a time series\nplt.figure(figsize=(10,6))\nplt.plot(df_avg_inflation['Year'], df_avg_inflation['Inflation'], marker='o')\nplt.title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel('Inflation')\nplt.grid(True)\nplt.show()" + }, + "call_id": "f4efa2d4-e4e7-4ea1-8c5e-6a78bec5816f", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -31468,6 +31417,111 @@ "value": "progress" }, "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "qQY5sAli", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:21.953806+00:00", + "__module__": "datetime" + }, + "trace_id": "8YKzpfybSiGgrHOF", + "type": "metric", + "unit": "tokens", + "value": 432 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "qQY5sAli", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:21.953843+00:00", + "__module__": "datetime" + }, + "trace_id": "8YKzpfybSiGgrHOF", + "type": "metric", + "unit": "tokens", + "value": 10 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "qQY5sAli", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:21.953847+00:00", + "__module__": "datetime" + }, + "trace_id": "8YKzpfybSiGgrHOF", + "type": "metric", + "unit": "tokens", + "value": 442 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics about the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, "stop_reason": null }, "metrics": null @@ -31479,7 +31533,7 @@ "data": { "event": { "delta": { - "text": " of rows and columns", + "text": "The", "type": "text" }, "event_type": { @@ -31499,7 +31553,7 @@ "data": { "event": { "delta": { - "text": " in the data", + "text": " error message indicates that the file \"/var/folders/rb/qv8vwgyj6y", "type": "text" }, "event_type": { @@ -31519,7 +31573,7 @@ "data": { "event": { "delta": { - "text": ":\", df.shape)\n print", + "text": "jd3t4pwsy9t0rm0000gn/T/tmp2x_sml", "type": "text" }, "event_type": { @@ -31539,7 +31593,7 @@ "data": { "event": { "delta": { - "text": "(\"Columns of the data are", + "text": "66/9vYvmVRoinflation.csv\" does not exist. This could be", "type": "text" }, "event_type": { @@ -31559,7 +31613,7 @@ "data": { "event": { "delta": { - "text": ":\", len(df", + "text": " due to a variety of", "type": "text" }, "event_type": { @@ -31579,7 +31633,7 @@ "data": { "event": { "delta": { - "text": ".columns))\n print(\"Columns", + "text": " reasons such as the file", "type": "text" }, "event_type": { @@ -31599,7 +31653,7 @@ "data": { "event": { "delta": { - "text": " of the data are:\", df", + "text": " being deleted, the path being incorrect, or the file", "type": "text" }, "event_type": { @@ -31619,7 +31673,7 @@ "data": { "event": { "delta": { - "text": ".columns)\n print(\"Dat", + "text": " not being accessible.\n\nTo resolve this issue, you can try", "type": "text" }, "event_type": { @@ -31639,7 +31693,7 @@ "data": { "event": { "delta": { - "text": "atype of the columns are:\",", + "text": " the following:\n\n1. Check the file path: Ensure that", "type": "text" }, "event_type": { @@ -31659,7 +31713,7 @@ "data": { "event": { "delta": { - "text": " df.dtypes)\n print", + "text": " the file path is correct and the file exists at that", "type": "text" }, "event_type": { @@ -31679,7 +31733,7 @@ "data": { "event": { "delta": { - "text": "(\"Data sample from file:\")\n", + "text": " location.\n2. Check file permissions: Ensure that", "type": "text" }, "event_type": { @@ -31699,7 +31753,7 @@ "data": { "event": { "delta": { - "text": " print(df.head())\nelse", + "text": " the file is accessible and", "type": "text" }, "event_type": { @@ -31719,7 +31773,7 @@ "data": { "event": { "delta": { - "text": ":\n print(\"The file", + "text": " you have the necessary permissions to read", "type": "text" }, "event_type": { @@ -31739,7 +31793,7 @@ "data": { "event": { "delta": { - "text": " does not exist\")\n``", + "text": " it.\n3. Try a different file: If", "type": "text" }, "event_type": { @@ -31759,7 +31813,7 @@ "data": { "event": { "delta": { - "text": "`\n\nThis code checks if", + "text": " the file is not accessible, try loading a different file to see", "type": "text" }, "event_type": { @@ -31779,7 +31833,7 @@ "data": { "event": { "delta": { - "text": " the file exists before attempting", + "text": " if the issue is specific to this file or a general", "type": "text" }, "event_type": { @@ -31799,7 +31853,7 @@ "data": { "event": { "delta": { - "text": " to read it. If the", + "text": " issue with your code.\n", "type": "text" }, "event_type": { @@ -31819,7 +31873,7 @@ "data": { "event": { "delta": { - "text": " file does not exist, it", + "text": "4. Check for typos: Ensure that", "type": "text" }, "event_type": { @@ -31839,7 +31893,7 @@ "data": { "event": { "delta": { - "text": " prints a message indicating that the", + "text": " there are no typos in the file path or the code.\n\n", "type": "text" }, "event_type": { @@ -31859,7 +31913,7 @@ "data": { "event": { "delta": { - "text": " file does not exist.", + "text": "If you are", "type": "text" }, "event_type": { @@ -31879,38 +31933,98 @@ "data": { "event": { "delta": { - "text": "", + "text": " still having issues, please provide more details about", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, - "metrics": [ + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the file and the code you are using", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ", and I'll be happy to help further.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ { "attributes": { "model_id": "meta-llama/Llama-3.3-70B-Instruct", "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "UF2BeSUk", + "span_id": "KwfNrQLy", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:17:58.914120+00:00", + "__datetime__": "2025-03-07T01:44:19.630894+00:00", "__module__": "datetime" }, - "trace_id": "PBS_ZwZnRYGrcPR-", + "trace_id": "kNsljyzfQV2Cn4aZ", "type": "metric", "unit": "tokens", - "value": 234 + "value": 192 }, { "attributes": { @@ -31918,16 +32032,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "UF2BeSUk", + "span_id": "KwfNrQLy", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:17:58.914184+00:00", + "__datetime__": "2025-03-07T01:44:19.630987+00:00", "__module__": "datetime" }, - "trace_id": "PBS_ZwZnRYGrcPR-", + "trace_id": "kNsljyzfQV2Cn4aZ", "type": "metric", "unit": "tokens", - "value": 302 + "value": 238 }, { "attributes": { @@ -31935,16 +32049,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "UF2BeSUk", + "span_id": "KwfNrQLy", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:17:58.914191+00:00", + "__datetime__": "2025-03-07T01:44:19.630996+00:00", "__module__": "datetime" }, - "trace_id": "PBS_ZwZnRYGrcPR-", + "trace_id": "kNsljyzfQV2Cn4aZ", "type": "metric", "unit": "tokens", - "value": 536 + "value": 430 } ] } @@ -31952,7 +32066,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -32010,7 +32124,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "import pandas as pd\n#", + "tool_call": "import pandas as pd\n\n# Load the CSV file\ndf = pd.read", "type": "tool_call" }, "event_type": { @@ -32035,7 +32149,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " Load data\ndf", + "tool_call": "_csv(\"/var/folders/rb/qv8vwgyj6y", "type": "tool_call" }, "event_type": { @@ -32060,7 +32174,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " = pd.read_csv(\"/var", + "tool_call": "jd3t4pwsy9t0rm0000gn/T", "type": "tool_call" }, "event_type": { @@ -32085,7 +32199,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "/folders/rb/qv", + "tool_call": "/tmp2x_sml66/9vYvmVRoinflation.csv", "type": "tool_call" }, "event_type": { @@ -32110,7 +32224,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "8vwgyj6y", + "tool_call": "\")\n\n# Print the first few rows of the dataframe\nprint(df.head())\n\n", "type": "tool_call" }, "event_type": { @@ -32135,7 +32249,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "jd3t4pws", + "tool_call": "# Print information about the dataframe\nprint(df", "type": "tool_call" }, "event_type": { @@ -32160,7 +32274,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "y9t0rm000", + "tool_call": ".info())\n\n# Print summary statistics about the dataframe\nprint(df.describe", "type": "tool_call" }, "event_type": { @@ -32185,7 +32299,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "0gn/T/tmpjdr", + "tool_call": "())", "type": "tool_call" }, "event_type": { @@ -32208,9 +32322,19 @@ "parse_status": { "__enum__": "ToolCallParseStatus", "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4pwsy9t0rm0000gn/T/tmp2x_sml66/9vYvmVRoinflation.csv\")\n\n# Print the first few rows of the dataframe\nprint(df.head())\n\n# Print information about the dataframe\nprint(df.info())\n\n# Print summary statistics about the dataframe\nprint(df.describe())" + }, + "call_id": "5bbfebeb-4360-4ef9-a9e2-4227a8e8c699", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } }, - "tool_call": "nryox/gEWH", "type": "tool_call" }, "event_type": { @@ -32219,7 +32343,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -32230,43 +32358,94 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "hdSVinflation.csv\")\n", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "AyEX3So6", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:17.873486+00:00", + "__module__": "datetime" + }, + "trace_id": "kNsljyzfQV2Cn4aZ", + "type": "metric", + "unit": "tokens", + "value": 36 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "AyEX3So6", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:17.873500+00:00", + "__module__": "datetime" + }, + "trace_id": "kNsljyzfQV2Cn4aZ", + "type": "metric", + "unit": "tokens", + "value": 10 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "AyEX3So6", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:17.873503+00:00", + "__module__": "datetime" + }, + "trace_id": "kNsljyzfQV2Cn4aZ", + "type": "metric", + "unit": "tokens", + "value": 46 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:42933\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:20e5d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0cd43\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"using LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:20e5d\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:20e5d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:20e5d\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics about the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"The error message indicates that the file \\\"\" does not exist. This could be due to a variety of reasons such as the file being deleted, the path being incorrect, or the file not being accessible.\\n\\nTo resolve this issue, you can try the following:\\n\\n1. Check the file path: Ensure that the file path is correct and the file exists at that location.\\n2. Check file permissions: Ensure that the file is accessible and you have the necessary permissions to read it.\\n3. Try a different file: If the file is not accessible, try loading a different file to see if the issue is specific to this file or a general issue with your code.\\n4. Check for typos: Ensure that there are no typos in the file path or the code.\\n\\nIf you are still having issues, please provide more details about the file and the code you are using, and I'll be happy to help further.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Convert the 'Year' column to datetime\\ndf['Year'] = pd.to_datetime(df['Year'], format='%Y')\\n\\n# Group by 'Year' and calculate the average inflation\\ndf_avg_inflation = df.groupby('Year')['Inflation'].mean().reset_index()\\n\\n# Plot the average yearly inflation as a time series\\nplt.figure(figsize=(10,6))\\nplt.plot(df_avg_inflation['Year'], df_avg_inflation['Inflation'], marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Inflation')\\nplt.grid(True)\\nplt.show()\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:42933\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:20e5d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0cd43\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -32895,27 +33040,7 @@ "data": { "event": { "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " error message indicates that the file \"/var/folders/rb/qv8", + "text": "[k", "type": "text" }, "event_type": { @@ -32935,7 +33060,7 @@ "data": { "event": { "delta": { - "text": "vwgyj6yjd3t4pwsy9t0", + "text": "nowledge_search(query=\"using LoRA in Torchtune", "type": "text" }, "event_type": { @@ -32955,7 +33080,7 @@ "data": { "event": { "delta": { - "text": "rm0000gn/T/tmp2x_sml66/9vY", + "text": "\")]", "type": "text" }, "event_type": { @@ -32975,8 +33100,19 @@ "data": { "event": { "delta": { - "text": "vmVRoinflation.csv\" does not exist. This could be due to", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "using LoRA in Torchtune" + }, + "call_id": "ce4b06be-6e7f-45cf-9555-25398caaf4f1", + "tool_name": "knowledge_search" + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -32984,7 +33120,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -32995,11540 +33135,20 @@ "data": { "event": { "delta": { - "text": " a variety of reasons such as the file being deleted, the path being incorrect", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": ", or the file not being accessible.\n\nTo resolve this issue, you can", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " try the following:\n\n1. Check the file path: Ensure that the file", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " path is correct and the file exists at that location.\n2. Check file permissions:", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " Ensure that the file is accessible and you have the necessary permissions to", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " read it.\n3. Try a different file: If the file is not", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " accessible, try loading a different file to see if the issue is specific to", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " this file or a general issue with your code.\n4. Check for ty", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "pos: Ensure that there are no typos in the file path or the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " code.\n\nIf you are still having issues, please provide more details about the file and the code", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " you are using, and I'll be happy to help further.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "f28sT2i7", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:23.262530+00:00", - "__module__": "datetime" - }, - "trace_id": "8YKzpfybSiGgrHOF", - "type": "metric", - "unit": "tokens", - "value": 680 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "f28sT2i7", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:23.262555+00:00", - "__module__": "datetime" - }, - "trace_id": "8YKzpfybSiGgrHOF", - "type": "metric", - "unit": "tokens", - "value": 238 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "f28sT2i7", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:23.262558+00:00", - "__module__": "datetime" - }, - "trace_id": "8YKzpfybSiGgrHOF", - "type": "metric", - "unit": "tokens", - "value": 918 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics about the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"The error message indicates that the file \\\"\" does not exist. This could be due to a variety of reasons such as the file being deleted, the path being incorrect, or the file not being accessible.\\n\\nTo resolve this issue, you can try the following:\\n\\n1. Check the file path: Ensure that the file path is correct and the file exists at that location.\\n2. Check file permissions: Ensure that the file is accessible and you have the necessary permissions to read it.\\n3. Try a different file: If the file is not accessible, try loading a different file to see if the issue is specific to this file or a general issue with your code.\\n4. Check for typos: Ensure that there are no typos in the file path or the code.\\n\\nIf you are still having issues, please provide more details about the file and the code you are using, and I'll be happy to help further.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load the CSV", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " file\ndf = pd.read_csv(\"/var/folders/rb/qv", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "8vwgyj6yjd3t4pwsy9t", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "0rm0000gn/T/tmp2x_sml66/9v", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "YvmVRoinflation.csv\")\n\n# Convert the 'Year'", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " column to datetime\ndf['Year'] = pd.to_datetime(df['Year", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "'], format='%Y')\n\n# Group by", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " 'Year' and calculate the average inflation\ndf_avg_in", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "flation = df.groupby('Year')['Inflation'].mean().reset_index()\n\n", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "# Plot the average yearly inflation as a time series\n", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "plt.figure(figsize=(10,6))\nplt.plot(df_avg_inflation['", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "Year'], df_avg_in", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "flation['Inflation'], marker='o')\nplt", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": ".title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "('Inflation')\nplt.grid(True)\nplt.show()", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4pwsy9t0rm0000gn/T/tmp2x_sml66/9vYvmVRoinflation.csv\")\n\n# Convert the 'Year' column to datetime\ndf['Year'] = pd.to_datetime(df['Year'], format='%Y')\n\n# Group by 'Year' and calculate the average inflation\ndf_avg_inflation = df.groupby('Year')['Inflation'].mean().reset_index()\n\n# Plot the average yearly inflation as a time series\nplt.figure(figsize=(10,6))\nplt.plot(df_avg_inflation['Year'], df_avg_inflation['Inflation'], marker='o')\nplt.title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel('Inflation')\nplt.grid(True)\nplt.show()" - }, - "call_id": "f4efa2d4-e4e7-4ea1-8c5e-6a78bec5816f", - "tool_name": { - "__enum__": "BuiltinTool", - "__module__": "llama_stack.models.llama.datatypes", - "value": "code_interpreter" - } - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "qQY5sAli", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:21.953806+00:00", - "__module__": "datetime" - }, - "trace_id": "8YKzpfybSiGgrHOF", - "type": "metric", - "unit": "tokens", - "value": 432 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "qQY5sAli", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:21.953843+00:00", - "__module__": "datetime" - }, - "trace_id": "8YKzpfybSiGgrHOF", - "type": "metric", - "unit": "tokens", - "value": 10 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "qQY5sAli", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:21.953847+00:00", - "__module__": "datetime" - }, - "trace_id": "8YKzpfybSiGgrHOF", - "type": "metric", - "unit": "tokens", - "value": 442 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics of the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"The error message indicates that the file \\\"\" does not exist. This could be due to a number of reasons such as the file being deleted, the path being incorrect, or the file not being accessible.\\n\\nTo resolve this issue, you should ensure that the file exists and the path is correct. If the file does not exist, you will need to create it or obtain it from the relevant source. If the path is incorrect, you will need to update the path to the correct location of the file.\\n\\nAdditionally, the error message mentions \\\"bwrap\\\" which is a command-line tool for running programs in a sandboxed environment. It's possible that the issue is related to the environment in which the code is being run, rather than the code itself.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Convert the 'Year' column to datetime\\ndf['Year'] = pd.to_datetime(df['Year'], format='%Y')\\n\\n# Group by 'Year' and calculate the average inflation\\ndf_avg_inflation = df.groupby('Year')['Inflation'].mean().reset_index()\\n\\n# Plot the average yearly inflation as a time series\\nplt.figure(figsize=(10,6))\\nplt.plot(df_avg_inflation['Year'], df_avg_inflation['Inflation'], marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Inflation')\\nplt.grid(True)\\nplt.show()\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " error message indicates that the file", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " \"/var/folders/rb", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "/qv8vwgyj", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "6yjd3t4", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "pwsy9t0", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "rm0000gn/T/tmp", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "jdrnryox/", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "6q7CwY", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "m0inflation.csv\"", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " does not exist. This could", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " be due to a number of", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " reasons such as the file being", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " deleted, the path being incorrect", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": ", or the file not being", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " accessible.\n\nTo resolve this issue", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": ", you should ensure that the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " file exists and the path is", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " correct. If the file does", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " not exist, you will need", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " to create it or obtain it", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " from the relevant source. If", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " the path is", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " incorrect, you will need to", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " update the path to the correct", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " location of the file.\n\nAdditionally", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": ", the error message mentions \"", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "bwrap\" which is a", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " command-line tool", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " for running programs in a sandbox", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "ed environment. It's possible", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " that the issue is related to", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " the environment in which the code", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " is being run, rather than", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " the code itself.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "gOIUTtiI", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-12T23:16:31.326082+00:00", - "__module__": "datetime" - }, - "trace_id": "yeHsGm3mQxqHTxdk", - "type": "metric", - "unit": "tokens", - "value": 655 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "gOIUTtiI", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-12T23:16:31.326098+00:00", - "__module__": "datetime" - }, - "trace_id": "yeHsGm3mQxqHTxdk", - "type": "metric", - "unit": "tokens", - "value": 207 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "gOIUTtiI", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-12T23:16:31.326100+00:00", - "__module__": "datetime" - }, - "trace_id": "yeHsGm3mQxqHTxdk", - "type": "metric", - "unit": "tokens", - "value": 862 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics of the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"The error message indicates that the file \\\"\" does not exist. This could be due to a number of reasons such as the file being deleted, the path being incorrect, or the file not being accessible.\\n\\nTo resolve this issue, you should ensure that the file exists and the path is correct. If the file does not exist, you will need to create it or obtain it from the relevant source. If the path is incorrect, you will need to update the path to the correct location of the file.\\n\\nAdditionally, the error message mentions \\\"bwrap\\\" which is a command-line tool for running programs in a sandboxed environment. It's possible that the issue is related to the environment in which the code is being run, rather than the code itself.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "import pandas as pd\nimport", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " matplotlib.pyplot as", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " plt\n\n# Load", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " the CSV file\n", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "df = pd.read", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "_csv(\"/var/f", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "olders/rb/q", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "v8vwgy", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "j6yjd", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "3t4p", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "wsy9t", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "0rm0000", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "gn/T/tmpj", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "drnryox", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "/6q7", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "CwYm0", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "inflation.csv\")\n\n# Convert", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " the 'Year' column", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " to datetime\ndf", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "['Year'] =", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " pd.to_datetime(df", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "['Year'], format", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "='%Y')\n\n#", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " Group by", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " 'Year'", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " and calculate the average inflation\n", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "df_avg_inflation = df", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": ".groupby('Year')['Inflation", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "'].mean().reset_index()\n\n", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "# Plot the average yearly inflation", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " as a time", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " series\nplt", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": ".figure(figsize=(10,6", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "))\nplt.plot(df_avg_in", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "flation['Year'], df_avg", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "_inflation['Inflation'],", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " marker='o')\nplt.title", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "('Average Year", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "ly Inflation", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "')\nplt.xlabel('Year')\n", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "plt.ylabel('Inflation')\n", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "plt.grid(True)\nplt.show", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "()", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4pwsy9t0rm0000gn/T/tmpjdrnryox/6q7CwYm0inflation.csv\")\n\n# Convert the 'Year' column to datetime\ndf['Year'] = pd.to_datetime(df['Year'], format='%Y')\n\n# Group by 'Year' and calculate the average inflation\ndf_avg_inflation = df.groupby('Year')['Inflation'].mean().reset_index()\n\n# Plot the average yearly inflation as a time series\nplt.figure(figsize=(10,6))\nplt.plot(df_avg_inflation['Year'], df_avg_inflation['Inflation'], marker='o')\nplt.title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel('Inflation')\nplt.grid(True)\nplt.show()" - }, - "call_id": "f39928f4-df63-46b1-9ab7-269f5d80df83", - "tool_name": { - "__enum__": "BuiltinTool", - "__module__": "llama_stack.models.llama.datatypes", - "value": "code_interpreter" - } - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "SPVD5n3Z", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-12T23:16:28.011354+00:00", - "__module__": "datetime" - }, - "trace_id": "yeHsGm3mQxqHTxdk", - "type": "metric", - "unit": "tokens", - "value": 404 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "SPVD5n3Z", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-12T23:16:28.011382+00:00", - "__module__": "datetime" - }, - "trace_id": "yeHsGm3mQxqHTxdk", - "type": "metric", - "unit": "tokens", - "value": 10 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "SPVD5n3Z", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-12T23:16:28.011389+00:00", - "__module__": "datetime" - }, - "trace_id": "yeHsGm3mQxqHTxdk", - "type": "metric", - "unit": "tokens", - "value": 414 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics about the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " error message indicates that the file \"/var/folders/rb/qv8vwgyj6y", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "jd3t4pwsy9t0rm0000gn/T/tmp2x_sml", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "66/9vYvmVRoinflation.csv\" does not exist. This could be", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " due to a variety of", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " reasons such as the file", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " being deleted, the path being incorrect, or the file", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " not being accessible.\n\nTo resolve this issue, you can try", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " the following:\n\n1. Check the file path: Ensure that", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " the file path is correct and the file exists at that", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " location.\n2. Check file permissions: Ensure that", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " the file is accessible and", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " you have the necessary permissions to read", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " it.\n3. Try a different file: If", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " the file is not accessible, try loading a different file to see", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " if the issue is specific to this file or a general", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " issue with your code.\n", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "4. Check for typos: Ensure that", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " there are no typos in the file path or the code.\n\n", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "If you are", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " still having issues, please provide more details about", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " the file and the code you are using", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": ", and I'll be happy to help further.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "KwfNrQLy", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:19.630894+00:00", - "__module__": "datetime" - }, - "trace_id": "kNsljyzfQV2Cn4aZ", - "type": "metric", - "unit": "tokens", - "value": 192 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "KwfNrQLy", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:19.630987+00:00", - "__module__": "datetime" - }, - "trace_id": "kNsljyzfQV2Cn4aZ", - "type": "metric", - "unit": "tokens", - "value": 238 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "KwfNrQLy", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:19.630996+00:00", - "__module__": "datetime" - }, - "trace_id": "kNsljyzfQV2Cn4aZ", - "type": "metric", - "unit": "tokens", - "value": 430 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics of the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " error message indicates that the file", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " \"/var/folders/rb", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "/qv8vwgyj", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "6yjd3t4", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "pwsy9t0", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "rm0000gn/T/tmp", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "jdrnryox/", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "6q7CwY", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "m0in", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "flation.csv\"", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " does not exist. This could", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " be due to a", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " number of", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " reasons such as", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " the file being deleted, the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " path being", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " incorrect, or the file not", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " being accessible.\n\nTo", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " resolve this", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " issue, you should ensure", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " that the file exists", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " and the path is correct.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " If the file does not", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " exist, you will need", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " to create it or obtain it", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " from the relevant", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " source. If", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " the path is", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " incorrect, you", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " will need to update the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " path to the correct", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " location of the file.\n\nAdditionally", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": ", the error", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " message mentions \"bwrap\"", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " which is a command", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "-line", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " tool", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " for running programs", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " in a sandboxed", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " environment. It's", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " possible that the issue is", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " related to the environment", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " in", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " which the code is", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " being run, rather", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " than", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " the code itself.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "lgPGJmgn", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-12T23:16:22.739081+00:00", - "__module__": "datetime" - }, - "trace_id": "H5eQcwg3S5yEsFZA", - "type": "metric", - "unit": "tokens", - "value": 195 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "lgPGJmgn", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-12T23:16:22.739118+00:00", - "__module__": "datetime" - }, - "trace_id": "H5eQcwg3S5yEsFZA", - "type": "metric", - "unit": "tokens", - "value": 207 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "lgPGJmgn", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-12T23:16:22.739122+00:00", - "__module__": "datetime" - }, - "trace_id": "H5eQcwg3S5yEsFZA", - "type": "metric", - "unit": "tokens", - "value": 402 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "import pandas as pd\n\n#", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " Load the CSV file\ndf", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " = pd.read_csv(\"/var", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "/folders/rb/qv", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "8vwgyj6y", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "jd3t", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "4pwsy9t", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "0rm0000gn/T", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "/tmpjdrnryox", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "/6q7Cw", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "Ym0inflation.csv", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "\")\n\n# Print the first few", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " rows of the dataframe\nprint", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "(df.head())\n\n#", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " Print information about the dataframe\n", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "print(df.info())\n\n# Print", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " summary statistics of the", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " dataframe\nprint(df.describe())", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "code": "import pandas as pd\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4pwsy9t0rm0000gn/T/tmpjdrnryox/6q7CwYm0inflation.csv\")\n\n# Print the first few rows of the dataframe\nprint(df.head())\n\n# Print information about the dataframe\nprint(df.info())\n\n# Print summary statistics of the dataframe\nprint(df.describe())" - }, - "call_id": "104a058f-5fa5-4861-a2f4-28e09bf1dfbc", - "tool_name": { - "__enum__": "BuiltinTool", - "__module__": "llama_stack.models.llama.datatypes", - "value": "code_interpreter" - } - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "qgUmXXsV", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-12T23:16:14.519707+00:00", - "__module__": "datetime" - }, - "trace_id": "H5eQcwg3S5yEsFZA", - "type": "metric", - "unit": "tokens", - "value": 36 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "qgUmXXsV", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-12T23:16:14.519781+00:00", - "__module__": "datetime" - }, - "trace_id": "H5eQcwg3S5yEsFZA", - "type": "metric", - "unit": "tokens", - "value": 10 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "qgUmXXsV", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-12T23:16:14.519787+00:00", - "__module__": "datetime" - }, - "trace_id": "H5eQcwg3S5yEsFZA", - "type": "metric", - "unit": "tokens", - "value": 46 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:42933\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:20e5d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0cd43\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"using LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:20e5d\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:20e5d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:20e5d\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:42933\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:20e5d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0cd43\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "[k", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "nowledge_search(query=\"using LoRA in Torchtune", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "\")]", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "using LoRA in Torchtune" - }, - "call_id": "ce4b06be-6e7f-45cf-9555-25398caaf4f1", - "tool_name": "knowledge_search" - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "vGtNmXNY", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:32.673350+00:00", - "__module__": "datetime" - }, - "trace_id": "8C2YTmRESTKZ0i1l", - "type": "metric", - "unit": "tokens", - "value": 107 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "vGtNmXNY", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:32.673375+00:00", - "__module__": "datetime" - }, - "trace_id": "8C2YTmRESTKZ0i1l", - "type": "metric", - "unit": "tokens", - "value": 23 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "vGtNmXNY", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:32.673381+00:00", - "__module__": "datetime" - }, - "trace_id": "8C2YTmRESTKZ0i1l", - "type": "metric", - "unit": "tokens", - "value": 130 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:42933\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:20e5d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0cd43\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "I", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "'m ready to help. What's", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " your question about Torchtune?", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "7n3WMt3R", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:31.179269+00:00", - "__module__": "datetime" - }, - "trace_id": "BLgI_VzNTCCRs_2T", - "type": "metric", - "unit": "tokens", - "value": 75 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "7n3WMt3R", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:31.179301+00:00", - "__module__": "datetime" - }, - "trace_id": "BLgI_VzNTCCRs_2T", - "type": "metric", - "unit": "tokens", - "value": 25 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "7n3WMt3R", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:31.179308+00:00", - "__module__": "datetime" - }, - "trace_id": "BLgI_VzNTCCRs_2T", - "type": "metric", - "unit": "tokens", - "value": 100 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:7f524\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:4bcdb\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:c553d\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your first question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"using LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:4bcdb\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:4bcdb\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:4bcdb\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:7f524\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:4bcdb\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:c553d\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your first question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "[k", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "nowledge_search(query=\"", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "using LoRA in Tor", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "chtune\")]", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "using LoRA in Torchtune" - }, - "call_id": "bbfbe149-e78a-4ec1-9cb9-37f47b482d31", - "tool_name": "knowledge_search" - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "6F-9YFWm", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-12T23:17:40.613104+00:00", - "__module__": "datetime" - }, - "trace_id": "gDuozGbVSrmg-3Tl", - "type": "metric", - "unit": "tokens", - "value": 108 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "6F-9YFWm", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-12T23:17:40.613124+00:00", - "__module__": "datetime" - }, - "trace_id": "gDuozGbVSrmg-3Tl", - "type": "metric", - "unit": "tokens", - "value": 23 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "6F-9YFWm", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-12T23:17:40.613129+00:00", - "__module__": "datetime" - }, - "trace_id": "gDuozGbVSrmg-3Tl", - "type": "metric", - "unit": "tokens", - "value": 131 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:7f524\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:4bcdb\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:c553d\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "I", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "'m ready", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " to help.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " What's your first", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " question about Torcht", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "une?", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "dVzcpbWR", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-12T23:17:38.650749+00:00", - "__module__": "datetime" - }, - "trace_id": "sElmvWPvRneQHEaY", - "type": "metric", - "unit": "tokens", - "value": 75 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "dVzcpbWR", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-12T23:17:38.650820+00:00", - "__module__": "datetime" - }, - "trace_id": "sElmvWPvRneQHEaY", - "type": "metric", - "unit": "tokens", - "value": 26 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "dVzcpbWR", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-12T23:17:38.650834+00:00", - "__module__": "datetime" - }, - "trace_id": "sElmvWPvRneQHEaY", - "type": "metric", - "unit": "tokens", - "value": 101 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:8106c\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:a03f3\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0719d\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your first question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"using LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:a03f3\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:a03f3\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:a03f3\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:8106c\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:a03f3\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0719d\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your first question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "[k", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "nowledge_search(query=\"using LoRA in Torchtune", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "\")]", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "using LoRA in Torchtune" - }, - "call_id": "d45a488f-368a-4a3b-a2d9-8fde584fc8f8", - "tool_name": "knowledge_search" - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "qLPBZlok", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:26.209198+00:00", - "__module__": "datetime" - }, - "trace_id": "7GQeegpgTI-gqjHp", - "type": "metric", - "unit": "tokens", - "value": 108 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "qLPBZlok", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:26.209239+00:00", - "__module__": "datetime" - }, - "trace_id": "7GQeegpgTI-gqjHp", - "type": "metric", - "unit": "tokens", - "value": 23 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "qLPBZlok", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:26.209247+00:00", - "__module__": "datetime" - }, - "trace_id": "7GQeegpgTI-gqjHp", - "type": "metric", - "unit": "tokens", - "value": 131 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:8106c\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:a03f3\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0719d\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "I", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "'m ready to help. What's", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " your first question about Torchtune", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "?", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "mYTkxvK_", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:23.525734+00:00", - "__module__": "datetime" - }, - "trace_id": "kpcdkZQ2SsSOh9Lw", - "type": "metric", - "unit": "tokens", - "value": 75 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "mYTkxvK_", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:23.525763+00:00", - "__module__": "datetime" - }, - "trace_id": "kpcdkZQ2SsSOh9Lw", - "type": "metric", - "unit": "tokens", - "value": 26 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "mYTkxvK_", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:23.525770+00:00", - "__module__": "datetime" - }, - "trace_id": "kpcdkZQ2SsSOh9Lw", - "type": "metric", - "unit": "tokens", - "value": 101 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:9cb06\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:ed09a\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:48279\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"using LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:ed09a\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:ed09a\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:ed09a\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:9cb06\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:ed09a\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:48279\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "[k", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "nowledge_search(query", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "=\"using Lo", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "RA in Torcht", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "une\")]", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "using LoRA in Torchtune" - }, - "call_id": "069baa18-792f-4268-bbd9-65499b6ca253", - "tool_name": "knowledge_search" - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "EhlYWsJp", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-12T23:17:28.344753+00:00", - "__module__": "datetime" - }, - "trace_id": "Uom62a7_SM2JU4Mp", - "type": "metric", - "unit": "tokens", - "value": 107 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "EhlYWsJp", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-12T23:17:28.344791+00:00", - "__module__": "datetime" - }, - "trace_id": "Uom62a7_SM2JU4Mp", - "type": "metric", - "unit": "tokens", - "value": 23 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "EhlYWsJp", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-12T23:17:28.344798+00:00", - "__module__": "datetime" - }, - "trace_id": "Uom62a7_SM2JU4Mp", - "type": "metric", - "unit": "tokens", - "value": 130 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:9cb06\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:ed09a\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:48279\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "I", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "'m ready", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " to help.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " What", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "'s your question about", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " Torchtune", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "?", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "aWF3H1iZ", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-12T23:17:24.710775+00:00", - "__module__": "datetime" - }, - "trace_id": "9fRx-MuMQbieYC8_", - "type": "metric", - "unit": "tokens", - "value": 75 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "aWF3H1iZ", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-12T23:17:24.710874+00:00", - "__module__": "datetime" - }, - "trace_id": "9fRx-MuMQbieYC8_", - "type": "metric", - "unit": "tokens", - "value": 25 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "aWF3H1iZ", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-12T23:17:24.710888+00:00", - "__module__": "datetime" - }, - "trace_id": "9fRx-MuMQbieYC8_", - "type": "metric", - "unit": "tokens", - "value": 100 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "[k", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "nowledge_search(query=\"", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "Torchtune documentation", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "\")]", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "Torchtune documentation" - }, - "call_id": "7ca12bd0-f629-4f23-9b14-a6f277b28a81", - "tool_name": "knowledge_search" - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "o5JuB0Ip", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-12T23:17:37.465863+00:00", - "__module__": "datetime" - }, - "trace_id": "sElmvWPvRneQHEaY", - "type": "metric", - "unit": "tokens", - "value": 39 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "o5JuB0Ip", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-12T23:17:37.465891+00:00", - "__module__": "datetime" - }, - "trace_id": "sElmvWPvRneQHEaY", - "type": "metric", - "unit": "tokens", - "value": 20 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "o5JuB0Ip", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-12T23:17:37.465897+00:00", - "__module__": "datetime" - }, - "trace_id": "sElmvWPvRneQHEaY", - "type": "metric", - "unit": "tokens", - "value": 59 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Llama3-8B attention type\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:num-1\\nContent: 3 `_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\\nof models across a `range of different benchmarks `_.\\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. In this tutorial we will focus on the 8B size model.\\nThere are a few main changes between Llama2-7B and Llama3-8B models:\\n\\n- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B\\n- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:num-1\\nContent: instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_\\n\\n|\\n\\nGetting access to Llama3-8B-Instruct\\n------------------------------------\\n\\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. First, let's download the model from Hugging Face. You will need to follow the instructions\\non the `official Meta page `_ to gain access to the model.\\nNext, make sure you grab your Hugging Face token from `here `_.\\n\\n\\n.. code-block:: bash\\n\\n tune download meta-llama/Meta-Llama-3\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:num-0\\nContent: :`download Llama3 Instruct weights `\\n\\n\\nTemplate changes from Llama2 to Llama3\\n--------------------------------------\\n\\nThe Llama2 chat model requires a specific template when prompting the pre-trained\\nmodel. Since the chat model was pretrained with this prompt template, if you want to run\\ninference on the model, you'll need to use the same template for optimal performance\\non chat data. Otherwise, the model will just perform standard text completion, which\\nmay or may not align with your intended use case.\\n\\nFrom the `official Llama2 prompt\\ntemplate guide `_\\nfor the Llama2 chat model, we can see that special tags are added:\\n\\n.. code-block:: text\\n\\n [INST] <>\\n You are a helpful, respectful, and honest assistant.\\n <>\\n\\n Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant \\n\\nLlama3 Instruct `overhauled `\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:num-0\\nContent: 'm Meta AI, your friendly AI assistant<|eot_id|>\\n\\nThe tags are entirely different, and they are actually encoded differently than in\\nLlama2. Let's walk through tokenizing an example with the Llama2 template and the\\nLlama3 template to understand how.\\n\\n.. note::\\n The Llama3 Base model uses a `different prompt template\\n `_ than Llama3 Instruct\\n because it has not yet been instruct tuned and the extra special tokens are untrained. If you\\n are running inference on the Llama3 Base model without fine-tuning we recommend the base\\n template for optimal performance. Generally, for instruct and chat data, we recommend using\\n Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\\n Llama3 Instruct.\\n\\n.. _prompt_template_vs_special_tokens:\\n\\nTokenizing prompt templates & special tokens\\n--------------------------------------------\\n\\nLet's say I have a sample of a single user-assistant turn accompanied with a system\\nprompt:\\n\\n.. code-block:: python\\n\\n sample = [\\n {\\n \\\"role\\\": \\\"system\\\",\\n \\\"\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:num-3\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\", and \\\"output_proj\\\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\\\"q_proj\\\", \\\"v_proj\\\"])\\n\\n.. note::\\n\\n Calling :func:`lora_llama_2\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Insert documents into memory\", \"parameters\": {}, \"tool_name\": \"insert_into_memory\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "L", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "lama3-8", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "B uses grouped", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "-query attention instead of", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " the standard", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " multi-head attention", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": ".", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "TS8BB6CQ", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-12T23:17:20.390405+00:00", - "__module__": "datetime" - }, - "trace_id": "XVVAGxGOTOqhq9V1", - "type": "metric", - "unit": "tokens", - "value": 80 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "TS8BB6CQ", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-12T23:17:20.390438+00:00", - "__module__": "datetime" - }, - "trace_id": "XVVAGxGOTOqhq9V1", - "type": "metric", - "unit": "tokens", - "value": 28 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "TS8BB6CQ", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-12T23:17:20.390443+00:00", - "__module__": "datetime" - }, - "trace_id": "XVVAGxGOTOqhq9V1", - "type": "metric", - "unit": "tokens", - "value": 108 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Llama3-8B attention type\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:num-1\\nContent: 3 `_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\\nof models across a `range of different benchmarks `_.\\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. In this tutorial we will focus on the 8B size model.\\nThere are a few main changes between Llama2-7B and Llama3-8B models:\\n\\n- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B\\n- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:num-1\\nContent: instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_\\n\\n|\\n\\nGetting access to Llama3-8B-Instruct\\n------------------------------------\\n\\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. First, let's download the model from Hugging Face. You will need to follow the instructions\\non the `official Meta page `_ to gain access to the model.\\nNext, make sure you grab your Hugging Face token from `here `_.\\n\\n\\n.. code-block:: bash\\n\\n tune download meta-llama/Meta-Llama-3\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:num-0\\nContent: :`download Llama3 Instruct weights `\\n\\n\\nTemplate changes from Llama2 to Llama3\\n--------------------------------------\\n\\nThe Llama2 chat model requires a specific template when prompting the pre-trained\\nmodel. Since the chat model was pretrained with this prompt template, if you want to run\\ninference on the model, you'll need to use the same template for optimal performance\\non chat data. Otherwise, the model will just perform standard text completion, which\\nmay or may not align with your intended use case.\\n\\nFrom the `official Llama2 prompt\\ntemplate guide `_\\nfor the Llama2 chat model, we can see that special tags are added:\\n\\n.. code-block:: text\\n\\n [INST] <>\\n You are a helpful, respectful, and honest assistant.\\n <>\\n\\n Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant \\n\\nLlama3 Instruct `overhauled `\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:num-0\\nContent: 'm Meta AI, your friendly AI assistant<|eot_id|>\\n\\nThe tags are entirely different, and they are actually encoded differently than in\\nLlama2. Let's walk through tokenizing an example with the Llama2 template and the\\nLlama3 template to understand how.\\n\\n.. note::\\n The Llama3 Base model uses a `different prompt template\\n `_ than Llama3 Instruct\\n because it has not yet been instruct tuned and the extra special tokens are untrained. If you\\n are running inference on the Llama3 Base model without fine-tuning we recommend the base\\n template for optimal performance. Generally, for instruct and chat data, we recommend using\\n Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\\n Llama3 Instruct.\\n\\n.. _prompt_template_vs_special_tokens:\\n\\nTokenizing prompt templates & special tokens\\n--------------------------------------------\\n\\nLet's say I have a sample of a single user-assistant turn accompanied with a system\\nprompt:\\n\\n.. code-block:: python\\n\\n sample = [\\n {\\n \\\"role\\\": \\\"system\\\",\\n \\\"\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:num-3\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\", and \\\"output_proj\\\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\\\"q_proj\\\", \\\"v_proj\\\"])\\n\\n.. note::\\n\\n Calling :func:`lora_llama_2\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "L", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "lama3-8", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "B uses grouped-query", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " attention instead of the standard", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " multi-head attention.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "NLFDWegH", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-12T23:17:16.053013+00:00", - "__module__": "datetime" - }, - "trace_id": "YuGKHtZmRseP3fC4", - "type": "metric", - "unit": "tokens", - "value": 80 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "NLFDWegH", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-12T23:17:16.053042+00:00", - "__module__": "datetime" - }, - "trace_id": "YuGKHtZmRseP3fC4", - "type": "metric", - "unit": "tokens", - "value": 28 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "NLFDWegH", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-12T23:17:16.053045+00:00", - "__module__": "datetime" - }, - "trace_id": "YuGKHtZmRseP3fC4", - "type": "metric", - "unit": "tokens", - "value": 108 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Insert documents into memory\", \"parameters\": {}, \"tool_name\": \"insert_into_memory\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "[k", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "nowledge_search(query=\"", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "Llama3-8B", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " attention type\")]", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "Llama3-8B attention type" - }, - "call_id": "f855c399-8087-4d70-b315-cbcdfc2e7c64", - "tool_name": "knowledge_search" - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "n9XTUtxe", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-12T23:17:18.759878+00:00", - "__module__": "datetime" - }, - "trace_id": "XVVAGxGOTOqhq9V1", - "type": "metric", - "unit": "tokens", - "value": 40 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "n9XTUtxe", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-12T23:17:18.759959+00:00", - "__module__": "datetime" - }, - "trace_id": "XVVAGxGOTOqhq9V1", - "type": "metric", - "unit": "tokens", - "value": 24 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "n9XTUtxe", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-12T23:17:18.759970+00:00", - "__module__": "datetime" - }, - "trace_id": "XVVAGxGOTOqhq9V1", - "type": "metric", - "unit": "tokens", - "value": 64 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "[k", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "nowledge_search(query=\"", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "Llama3-", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "8B attention type\")]", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "Llama3-8B attention type" - }, - "call_id": "a975cf93-4809-4ca9-8a4b-c42e116d58d0", - "tool_name": "knowledge_search" - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": [ { @@ -44537,16 +33157,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "861X8wcF", + "span_id": "vGtNmXNY", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:17:14.638649+00:00", + "__datetime__": "2025-03-07T01:45:32.673350+00:00", "__module__": "datetime" }, - "trace_id": "YuGKHtZmRseP3fC4", + "trace_id": "8C2YTmRESTKZ0i1l", "type": "metric", "unit": "tokens", - "value": 40 + "value": 107 }, { "attributes": { @@ -44554,16 +33174,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "861X8wcF", + "span_id": "vGtNmXNY", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:17:14.638678+00:00", + "__datetime__": "2025-03-07T01:45:32.673375+00:00", "__module__": "datetime" }, - "trace_id": "YuGKHtZmRseP3fC4", + "trace_id": "8C2YTmRESTKZ0i1l", "type": "metric", "unit": "tokens", - "value": 24 + "value": 23 }, { "attributes": { @@ -44571,16 +33191,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "861X8wcF", + "span_id": "vGtNmXNY", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:17:14.638685+00:00", + "__datetime__": "2025-03-07T01:45:32.673381+00:00", "__module__": "datetime" }, - "trace_id": "YuGKHtZmRseP3fC4", + "trace_id": "8C2YTmRESTKZ0i1l", "type": "metric", "unit": "tokens", - "value": 64 + "value": 130 } ] } @@ -44588,7 +33208,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the current CEO of Meta is.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"current CEO of Meta\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"{\\\"query\\\": \\\"current CEO of Meta\\\", \\\"top_k\\\": [{\\\"title\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\\\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\\\u00a9 2025 Meta\\\", \\\"score\\\": 0.79099923, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Meet the Executive CSuite Team of Meta (Facebook) [2025]\\\", \\\"url\\\": \\\"https://digitaldefynd.com/IQ/meet-the-executive-csuite-team-of-meta-facebook/\\\", \\\"content\\\": \\\"Harvard University Executive Programs Free Harvard University Courses As a chief financial officer of Meta, Susan Li oversees the firm\\\\u2019s finance and facilities team to keep track of the company\\\\u2019s overall financial health. The chief operating officer of Meta, Javier Olivan, oversees the firm\\\\u2019s business team, infrastructure, and other products. Andrew Bosworth, called Boz, serves as chief technology officer at Meta and is responsible for leading the firm\\\\u2019s AR/VR organization, Reality Labs. Andrew has also served as engineering director to oversee events, mobile monetization, and feed ads and as VP of ads and business platforms to lead engineering, design, analytics, and product teams. Meta\\\\u2019s c-suite team comprises experienced and diverse executives, having extensive experience in technology, finance, legal, and all major industries.\\\", \\\"score\\\": 0.7602419, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Meta - Leadership & Governance\\\", \\\"url\\\": \\\"https://investor.atmeta.com/leadership-and-governance/\\\", \\\"content\\\": \\\"Mr. Andreessen was a co-founder of Netscape Communications Corporation, a software company, serving in various positions, including Chief Technology Officer and Executive Vice President of Products. Ms. Killefer also served as Assistant Secretary for Management, Chief Financial Officer, and Chief Operating Officer of the U.S. Department of the Treasury from 1997 to 2000 and as a member of the IRS Oversight Board from 2000 to 2005, including as Chair of the IRS Oversight Board from 2002 to 2004. Ms. Travis has served as Executive Vice President and Chief Financial Officer of The Estee Lauder Companies Inc., a global manufacturer and marketer of skin care, makeup, fragrance and hair care products, since August 2012.\\\", \\\"score\\\": 0.6175132, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Executives - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/\\\", \\\"content\\\": \\\"Meta leadership: images of senior executives for download to use in articles about the company.\\\", \\\"score\\\": 0.21026355, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg - Wikipedia\\\", \\\"url\\\": \\\"https://en.wikipedia.org/wiki/Mark_Zuckerberg\\\", \\\"content\\\": \\\"They began dating in 2003.[175] In September 2010, Chan, who was a medical student at the University of California, San Francisco at the time,[176] moved into his rented house in Palo Alto, California.[177][178] They married on May 19, 2012, in the grounds of his mansion in an event that also celebrated her graduation from medical school.[179][180] Zuckerberg revealed in July 2015 that they were expecting a baby girl and that Chan had previously experienced three miscarriages.[181] Their first daughter was born in December 2015.[182] They announced in a Chinese New Year video that their daughter's Chinese name is Chen Mingyu (Chinese: \\\\u9648\\\\u660e\\\\u5b87).[183] Their second daughter was born in August 2017.[184] Zuckerberg and his wife welcomed their third daughter in March 2023 and announced the news across his social media pages.[185] The couple also have a Puli dog named Beast,[186] who has over two million followers on Facebook.[187] Zuckerberg commissioned the visual artist Daniel Arsham to build a 7-foot-tall sculpture of his wife, which was unveiled in 2024.[188]\\\", \\\"score\\\": 0.05564338, \\\"raw_content\\\": null}]}\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:42933\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:20e5d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0cd43\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -44616,7 +33236,7 @@ "data": { "event": { "delta": { - "text": "The", + "text": "I", "type": "text" }, "event_type": { @@ -44636,7 +33256,7 @@ "data": { "event": { "delta": { - "text": " current CEO of Meta is Mark", + "text": "'m ready to help. What's", "type": "text" }, "event_type": { @@ -44656,7 +33276,7 @@ "data": { "event": { "delta": { - "text": " Zuckerberg.", + "text": " your question about Torchtune?", "type": "text" }, "event_type": { @@ -44698,16 +33318,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "GW7-tnGo", + "span_id": "7n3WMt3R", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:16:03.252268+00:00", + "__datetime__": "2025-03-07T01:45:31.179269+00:00", "__module__": "datetime" }, - "trace_id": "2TwB_v0KTZWN9Q_U", + "trace_id": "BLgI_VzNTCCRs_2T", "type": "metric", "unit": "tokens", - "value": 1079 + "value": 75 }, { "attributes": { @@ -44715,16 +33335,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "GW7-tnGo", + "span_id": "7n3WMt3R", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:16:03.252339+00:00", + "__datetime__": "2025-03-07T01:45:31.179301+00:00", "__module__": "datetime" }, - "trace_id": "2TwB_v0KTZWN9Q_U", + "trace_id": "BLgI_VzNTCCRs_2T", "type": "metric", "unit": "tokens", - "value": 19 + "value": 25 }, { "attributes": { @@ -44732,16 +33352,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "GW7-tnGo", + "span_id": "7n3WMt3R", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:16:03.252346+00:00", + "__datetime__": "2025-03-07T01:45:31.179308+00:00", "__module__": "datetime" }, - "trace_id": "2TwB_v0KTZWN9Q_U", + "trace_id": "BLgI_VzNTCCRs_2T", "type": "metric", "unit": "tokens", - "value": 1098 + "value": 100 } ] } @@ -44749,7 +33369,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the current CEO of Meta is.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"current CEO of Meta\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"{\\\"query\\\": \\\"current CEO of Meta\\\", \\\"top_k\\\": [{\\\"title\\\": \\\"Meta - Leadership & Governance\\\", \\\"url\\\": \\\"https://investor.atmeta.com/leadership-and-governance/\\\", \\\"content\\\": \\\"Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. Mark is responsible for setting the overall direction and product strategy for the company. He leads the design of Meta's services and development of its core technology and infrastructure. Mark studied computer science at Harvard\\\", \\\"score\\\": 0.8342047, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Executives - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer Joel Kaplan, Chief Global Affairs Officer Susan Li, Chief Financial Officer Javier Olivan, Chief Operating Officer Chris Cox, Chief Product Officer Andrew \\\\u2018Boz\\\\u2019 Bosworth, Chief Technology Officer Jennifer Newstead, Chief Legal Officer Dave Wehner, Chief Strategy Officer Will Cathcart, Head of WhatsApp Naomi Gleit, Head of Product John Hegeman, Chief Revenue Officer Adam Mosseri, Head of Instagram Erin Egan, Chief Privacy Officer, Policy Michel Protti, Chief Privacy Officer, Product Alex Schultz, Chief Marketing Officer and VP of Analytics Tom Alison, Head of Facebook Nicola Mendelsohn, Head of Global Business Group Ahmad Al-Dahle, VP and Head of GenAI at Meta Joelle Pineau, Vice President of AI Research and Head of FAIR at Meta\\\", \\\"score\\\": 0.8190992, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\\\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\\\u00a9 2025 Meta\\\", \\\"score\\\": 0.79099923, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Meet the Executive CSuite Team of Meta (Facebook) [2025]\\\", \\\"url\\\": \\\"https://digitaldefynd.com/IQ/meet-the-executive-csuite-team-of-meta-facebook/\\\", \\\"content\\\": \\\"Harvard University Executive Programs Free Harvard University Courses As a chief financial officer of Meta, Susan Li oversees the firm\\\\u2019s finance and facilities team to keep track of the company\\\\u2019s overall financial health. The chief operating officer of Meta, Javier Olivan, oversees the firm\\\\u2019s business team, infrastructure, and other products. Andrew Bosworth, called Boz, serves as chief technology officer at Meta and is responsible for leading the firm\\\\u2019s AR/VR organization, Reality Labs. Andrew has also served as engineering director to oversee events, mobile monetization, and feed ads and as VP of ads and business platforms to lead engineering, design, analytics, and product teams. Meta\\\\u2019s c-suite team comprises experienced and diverse executives, having extensive experience in technology, finance, legal, and all major industries.\\\", \\\"score\\\": 0.7602419, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg - Wikipedia\\\", \\\"url\\\": \\\"https://en.wikipedia.org/wiki/Mark_Zuckerberg\\\", \\\"content\\\": \\\"They began dating in 2003.[175] In September 2010, Chan, who was a medical student at the University of California, San Francisco at the time,[176] moved into his rented house in Palo Alto, California.[177][178] They married on May 19, 2012, in the grounds of his mansion in an event that also celebrated her graduation from medical school.[179][180] Zuckerberg revealed in July 2015 that they were expecting a baby girl and that Chan had previously experienced three miscarriages.[181] Their first daughter was born in December 2015.[182] They announced in a Chinese New Year video that their daughter's Chinese name is Chen Mingyu (Chinese: \\\\u9648\\\\u660e\\\\u5b87).[183] Their second daughter was born in August 2017.[184] Zuckerberg and his wife welcomed their third daughter in March 2023 and announced the news across his social media pages.[185] The couple also have a Puli dog named Beast,[186] who has over two million followers on Facebook.[187] Zuckerberg commissioned the visual artist Daniel Arsham to build a 7-foot-tall sculpture of his wife, which was unveiled in 2024.[188]\\\", \\\"score\\\": 0.05564338, \\\"raw_content\\\": null}]}\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:8106c\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:a03f3\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0719d\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your first question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"using LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:a03f3\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:a03f3\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:a03f3\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": \"get_boiling_point\", \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", + "text": "=[\"q_proj\", \"v_proj\"])`\n3. Load", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -45133,7 +33557,7 @@ "data": { "event": { "delta": { - "text": "The", + "text": " the pre-trained Llama2 weights into", "type": "text" }, "event_type": { @@ -45153,7 +33577,7 @@ "data": { "event": { "delta": { - "text": " boiling point", + "text": " the LoRA model: `lora_model.load_state", "type": "text" }, "event_type": { @@ -45173,7 +33597,7 @@ "data": { "event": { "delta": { - "text": " of polyjuice is -", + "text": "_dict(base_model.state_dict(), strict=False)`\n4. Set only Lo", "type": "text" }, "event_type": { @@ -45193,7 +33617,7 @@ "data": { "event": { "delta": { - "text": "100\u00b0C.", + "text": "RA parameters to trainable: `set_trainable_params(lora_model, get", "type": "text" }, "event_type": { @@ -45213,94 +33637,33 @@ "data": { "event": { "delta": { - "text": "", + "text": "_adapter_params(lora_model))`\n5. Run the", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "kHgxCJiI", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-12T23:17:01.920310+00:00", - "__module__": "datetime" - }, - "trace_id": "rkZwHytISlOlMq9O", - "type": "metric", - "unit": "tokens", - "value": 77 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "kHgxCJiI", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-12T23:17:01.920372+00:00", - "__module__": "datetime" - }, - "trace_id": "rkZwHytISlOlMq9O", - "type": "metric", - "unit": "tokens", - "value": 22 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "kHgxCJiI", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-12T23:17:01.920380+00:00", - "__module__": "datetime" - }, - "trace_id": "rkZwHytISlOlMq9O", - "type": "metric", - "unit": "tokens", - "value": 99 - } - ] + "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", + "text": " LoRA finetune using torch", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -45314,7 +33677,7 @@ "data": { "event": { "delta": { - "text": "The", + "text": "tune's LoRA", "type": "text" }, "event_type": { @@ -45334,7 +33697,7 @@ "data": { "event": { "delta": { - "text": " boiling point of polyjuice", + "text": " recipe: `tune run --nnodes 1 --", "type": "text" }, "event_type": { @@ -45354,7 +33717,7 @@ "data": { "event": { "delta": { - "text": " is -100\u00b0C.", + "text": "nproc_per_node 2 lora_finetune_distributed --config", "type": "text" }, "event_type": { @@ -45374,94 +33737,33 @@ "data": { "event": { "delta": { - "text": "", + "text": " llama2/7B_lora`\n\nYou can also experiment with different Lo", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "-RYggjae", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-12T23:16:35.674057+00:00", - "__module__": "datetime" - }, - "trace_id": "YT4rgoiUSCS70qhA", - "type": "metric", - "unit": "tokens", - "value": 77 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "-RYggjae", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-12T23:16:35.674090+00:00", - "__module__": "datetime" - }, - "trace_id": "YT4rgoiUSCS70qhA", - "type": "metric", - "unit": "tokens", - "value": 22 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "-RYggjae", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-12T23:16:35.674098+00:00", - "__module__": "datetime" - }, - "trace_id": "YT4rgoiUSCS70qhA", - "type": "metric", - "unit": "tokens", - "value": 99 - } - ] + "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"required\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", + "text": "RA configurations, such as applying LoRA to all linear layers", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -45475,7 +33777,7 @@ "data": { "event": { "delta": { - "text": "The", + "text": " in the self-attention, increasing the rank, and", "type": "text" }, "event_type": { @@ -45495,7 +33797,7 @@ "data": { "event": { "delta": { - "text": " function get_bo", + "text": " scaling alpha and rank together.\n\nNote: You need to", "type": "text" }, "event_type": { @@ -45515,7 +33817,7 @@ "data": { "event": { "delta": { - "text": "iling_point is not", + "text": " have the Llama2 weights and tokenizer downloaded and installed before running the", "type": "text" }, "event_type": { @@ -45535,7 +33837,7 @@ "data": { "event": { "delta": { - "text": " able to", + "text": " LoRA finetune. Additionally, you can use", "type": "text" }, "event_type": { @@ -45555,7 +33857,7 @@ "data": { "event": { "delta": { - "text": " find the boiling point of", + "text": " torchtune's `Wand", "type": "text" }, "event_type": { @@ -45575,7 +33877,7 @@ "data": { "event": { "delta": { - "text": " \"polyjuice\"", + "text": "BLogger` to generate loss curves and track your experiments", "type": "text" }, "event_type": { @@ -45595,7 +33897,7 @@ "data": { "event": { "delta": { - "text": " as it is not", + "text": ".", "type": "text" }, "event_type": { @@ -45615,13 +33917,94 @@ "data": { "event": { "delta": { - "text": " a real liquid.", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "IZ8Q_jX_", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:28.484818+00:00", + "__module__": "datetime" + }, + "trace_id": "7GQeegpgTI-gqjHp", + "type": "metric", + "unit": "tokens", + "value": 147 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "IZ8Q_jX_", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:28.484914+00:00", + "__module__": "datetime" + }, + "trace_id": "7GQeegpgTI-gqjHp", + "type": "metric", + "unit": "tokens", + "value": 290 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "IZ8Q_jX_", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:28.484922+00:00", + "__module__": "datetime" + }, + "trace_id": "7GQeegpgTI-gqjHp", + "type": "metric", + "unit": "tokens", + "value": 437 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:8106c\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:a03f3\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0719d\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your first question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" }, "logprobs": null, "stop_reason": null @@ -45635,7 +34018,7 @@ "data": { "event": { "delta": { - "text": " Polyjuice is a", + "text": "[k", "type": "text" }, "event_type": { @@ -45655,7 +34038,7 @@ "data": { "event": { "delta": { - "text": " fictional substance from the", + "text": "nowledge_search(query=\"using LoRA in Torchtune", "type": "text" }, "event_type": { @@ -45675,7 +34058,7 @@ "data": { "event": { "delta": { - "text": " Harry Potter series", + "text": "\")]", "type": "text" }, "event_type": { @@ -45695,8 +34078,19 @@ "data": { "event": { "delta": { - "text": ".", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "using LoRA in Torchtune" + }, + "call_id": "d45a488f-368a-4a3b-a2d9-8fde584fc8f8", + "tool_name": "knowledge_search" + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -45704,7 +34098,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -45737,16 +34135,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "hcq3e4Mt", + "span_id": "qLPBZlok", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:16:48.744933+00:00", + "__datetime__": "2025-03-07T01:45:26.209198+00:00", "__module__": "datetime" }, - "trace_id": "f1UWdr3yT5CpYBm0", + "trace_id": "7GQeegpgTI-gqjHp", "type": "metric", "unit": "tokens", - "value": 77 + "value": 108 }, { "attributes": { @@ -45754,16 +34152,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "hcq3e4Mt", + "span_id": "qLPBZlok", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:16:48.744962+00:00", + "__datetime__": "2025-03-07T01:45:26.209239+00:00", "__module__": "datetime" }, - "trace_id": "f1UWdr3yT5CpYBm0", + "trace_id": "7GQeegpgTI-gqjHp", "type": "metric", "unit": "tokens", - "value": 51 + "value": 23 }, { "attributes": { @@ -45771,16 +34169,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "hcq3e4Mt", + "span_id": "qLPBZlok", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:16:48.744969+00:00", + "__datetime__": "2025-03-07T01:45:26.209247+00:00", "__module__": "datetime" }, - "trace_id": "f1UWdr3yT5CpYBm0", + "trace_id": "7GQeegpgTI-gqjHp", "type": "metric", "unit": "tokens", - "value": 128 + "value": 131 } ] } @@ -45788,7 +34186,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": \"get_boiling_point\", \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:8106c\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:a03f3\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0719d\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -45816,47 +34214,7 @@ "data": { "event": { "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " function call should be", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": ":\n[", + "text": "I", "type": "text" }, "event_type": { @@ -45876,7 +34234,7 @@ "data": { "event": { "delta": { - "text": "get", + "text": "'m ready to help. What's", "type": "text" }, "event_type": { @@ -45896,7 +34254,7 @@ "data": { "event": { "delta": { - "text": "_boiling_point(liquid_name='polyjuice', celci", + "text": " your first question about Torchtune", "type": "text" }, "event_type": { @@ -45916,7 +34274,7 @@ "data": { "event": { "delta": { - "text": "us=True)]", + "text": "?", "type": "text" }, "event_type": { @@ -45958,16 +34316,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "JN7UZs_c", + "span_id": "mYTkxvK_", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:42.473221+00:00", + "__datetime__": "2025-03-07T01:45:23.525734+00:00", "__module__": "datetime" }, - "trace_id": "H3r-_Zh-TVqtSp7k", + "trace_id": "kpcdkZQ2SsSOh9Lw", "type": "metric", "unit": "tokens", - "value": 86 + "value": 75 }, { "attributes": { @@ -45975,16 +34333,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "JN7UZs_c", + "span_id": "mYTkxvK_", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:42.473254+00:00", + "__datetime__": "2025-03-07T01:45:23.525763+00:00", "__module__": "datetime" }, - "trace_id": "H3r-_Zh-TVqtSp7k", + "trace_id": "kpcdkZQ2SsSOh9Lw", "type": "metric", "unit": "tokens", - "value": 34 + "value": 26 }, { "attributes": { @@ -45992,16 +34350,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "JN7UZs_c", + "span_id": "mYTkxvK_", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:42.473261+00:00", + "__datetime__": "2025-03-07T01:45:23.525770+00:00", "__module__": "datetime" }, - "trace_id": "H3r-_Zh-TVqtSp7k", + "trace_id": "kpcdkZQ2SsSOh9Lw", "type": "metric", "unit": "tokens", - "value": 120 + "value": 101 } ] } @@ -46009,7 +34367,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -46037,87 +34395,7 @@ "data": { "event": { "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " function `get_boiling_point`", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " is not a real function and cannot be", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " used to determine the boiling point of polyju", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "ice. Polyjuice is a fictional substance from the", + "text": "[k", "type": "text" }, "event_type": { @@ -46137,7 +34415,7 @@ "data": { "event": { "delta": { - "text": " Harry Potter series and does not have a real-world boiling", + "text": "nowledge_search(query=\"Tor", "type": "text" }, "event_type": { @@ -46157,7 +34435,7 @@ "data": { "event": { "delta": { - "text": " point. If you have any other questions or need help", + "text": "chtune documentation\")]", "type": "text" }, "event_type": { @@ -46177,8 +34455,19 @@ "data": { "event": { "delta": { - "text": " with a different topic, feel free to ask!", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "Torchtune documentation" + }, + "call_id": "385cbde8-19e8-4c8b-84ca-b75050b3666b", + "tool_name": "knowledge_search" + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -46186,7 +34475,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -46219,16 +34512,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "aCPTIc0d", + "span_id": "-7YS2sLl", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:53:27.227208+00:00", + "__datetime__": "2025-03-07T01:45:30.668846+00:00", "__module__": "datetime" }, - "trace_id": "4DRyVE86RpCeqfpE", + "trace_id": "BLgI_VzNTCCRs_2T", "type": "metric", "unit": "tokens", - "value": 86 + "value": 39 }, { "attributes": { @@ -46236,16 +34529,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "aCPTIc0d", + "span_id": "-7YS2sLl", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:53:27.227251+00:00", + "__datetime__": "2025-03-07T01:45:30.668859+00:00", "__module__": "datetime" }, - "trace_id": "4DRyVE86RpCeqfpE", + "trace_id": "BLgI_VzNTCCRs_2T", "type": "metric", "unit": "tokens", - "value": 78 + "value": 20 }, { "attributes": { @@ -46253,58 +34546,38 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "aCPTIc0d", + "span_id": "-7YS2sLl", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:53:27.227258+00:00", + "__datetime__": "2025-03-07T01:45:30.668861+00:00", "__module__": "datetime" }, - "trace_id": "4DRyVE86RpCeqfpE", + "trace_id": "BLgI_VzNTCCRs_2T", "type": "metric", - "unit": "tokens", - "value": 164 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"required\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null + "unit": "tokens", + "value": 59 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Llama3-8B attention type\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:num-1\\nContent: 3 `_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\\nof models across a `range of different benchmarks `_.\\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. In this tutorial we will focus on the 8B size model.\\nThere are a few main changes between Llama2-7B and Llama3-8B models:\\n\\n- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B\\n- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:num-1\\nContent: instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_\\n\\n|\\n\\nGetting access to Llama3-8B-Instruct\\n------------------------------------\\n\\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. First, let's download the model from Hugging Face. You will need to follow the instructions\\non the `official Meta page `_ to gain access to the model.\\nNext, make sure you grab your Hugging Face token from `here `_.\\n\\n\\n.. code-block:: bash\\n\\n tune download meta-llama/Meta-Llama-3\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:num-0\\nContent: :`download Llama3 Instruct weights `\\n\\n\\nTemplate changes from Llama2 to Llama3\\n--------------------------------------\\n\\nThe Llama2 chat model requires a specific template when prompting the pre-trained\\nmodel. Since the chat model was pretrained with this prompt template, if you want to run\\ninference on the model, you'll need to use the same template for optimal performance\\non chat data. Otherwise, the model will just perform standard text completion, which\\nmay or may not align with your intended use case.\\n\\nFrom the `official Llama2 prompt\\ntemplate guide `_\\nfor the Llama2 chat model, we can see that special tags are added:\\n\\n.. code-block:: text\\n\\n [INST] <>\\n You are a helpful, respectful, and honest assistant.\\n <>\\n\\n Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant \\n\\nLlama3 Instruct `overhauled `\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:num-0\\nContent: 'm Meta AI, your friendly AI assistant<|eot_id|>\\n\\nThe tags are entirely different, and they are actually encoded differently than in\\nLlama2. Let's walk through tokenizing an example with the Llama2 template and the\\nLlama3 template to understand how.\\n\\n.. note::\\n The Llama3 Base model uses a `different prompt template\\n `_ than Llama3 Instruct\\n because it has not yet been instruct tuned and the extra special tokens are untrained. If you\\n are running inference on the Llama3 Base model without fine-tuning we recommend the base\\n template for optimal performance. Generally, for instruct and chat data, we recommend using\\n Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\\n Llama3 Instruct.\\n\\n.. _prompt_template_vs_special_tokens:\\n\\nTokenizing prompt templates & special tokens\\n--------------------------------------------\\n\\nLet's say I have a sample of a single user-assistant turn accompanied with a system\\nprompt:\\n\\n.. code-block:: python\\n\\n sample = [\\n {\\n \\\"role\\\": \\\"system\\\",\\n \\\"\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:num-3\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\", and \\\"output_proj\\\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\\\"q_proj\\\", \\\"v_proj\\\"])\\n\\n.. note::\\n\\n Calling :func:`lora_llama_2\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Insert documents into memory\", \"parameters\": {}, \"tool_name\": \"insert_into_memory\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "The", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -46318,7 +34591,7 @@ "data": { "event": { "delta": { - "text": " function call should be in the following format", + "text": "L", "type": "text" }, "event_type": { @@ -46338,7 +34611,7 @@ "data": { "event": { "delta": { - "text": ": [function_name(parameters)]. However", + "text": "lama3-8B uses grouped-query", "type": "text" }, "event_type": { @@ -46358,7 +34631,7 @@ "data": { "event": { "delta": { - "text": ", the function get_boiling_point is not recognized", + "text": " attention instead of", "type": "text" }, "event_type": { @@ -46378,7 +34651,7 @@ "data": { "event": { "delta": { - "text": ". If the function", + "text": " the standard multi-head attention.", "type": "text" }, "event_type": { @@ -46398,53 +34671,94 @@ "data": { "event": { "delta": { - "text": " is supposed to return the boiling point of a liquid, it should be defined", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " before it can be used. \n\nIn this", - "type": "text" + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "1eIEdjPP", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:18.982970+00:00", + "__module__": "datetime" + }, + "trace_id": "rNeuYcnxTSqrP6Dg", + "type": "metric", + "unit": "tokens", + "value": 80 }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "1eIEdjPP", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:18.983000+00:00", + "__module__": "datetime" + }, + "trace_id": "rNeuYcnxTSqrP6Dg", + "type": "metric", + "unit": "tokens", + "value": 28 }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "1eIEdjPP", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:18.983005+00:00", + "__module__": "datetime" + }, + "trace_id": "rNeuYcnxTSqrP6Dg", + "type": "metric", + "unit": "tokens", + "value": 108 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Llama3-8B attention type\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:num-1\\nContent: 3 `_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\\nof models across a `range of different benchmarks `_.\\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. In this tutorial we will focus on the 8B size model.\\nThere are a few main changes between Llama2-7B and Llama3-8B models:\\n\\n- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B\\n- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:num-1\\nContent: instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_\\n\\n|\\n\\nGetting access to Llama3-8B-Instruct\\n------------------------------------\\n\\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. First, let's download the model from Hugging Face. You will need to follow the instructions\\non the `official Meta page `_ to gain access to the model.\\nNext, make sure you grab your Hugging Face token from `here `_.\\n\\n\\n.. code-block:: bash\\n\\n tune download meta-llama/Meta-Llama-3\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:num-0\\nContent: :`download Llama3 Instruct weights `\\n\\n\\nTemplate changes from Llama2 to Llama3\\n--------------------------------------\\n\\nThe Llama2 chat model requires a specific template when prompting the pre-trained\\nmodel. Since the chat model was pretrained with this prompt template, if you want to run\\ninference on the model, you'll need to use the same template for optimal performance\\non chat data. Otherwise, the model will just perform standard text completion, which\\nmay or may not align with your intended use case.\\n\\nFrom the `official Llama2 prompt\\ntemplate guide `_\\nfor the Llama2 chat model, we can see that special tags are added:\\n\\n.. code-block:: text\\n\\n [INST] <>\\n You are a helpful, respectful, and honest assistant.\\n <>\\n\\n Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant \\n\\nLlama3 Instruct `overhauled `\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:num-0\\nContent: 'm Meta AI, your friendly AI assistant<|eot_id|>\\n\\nThe tags are entirely different, and they are actually encoded differently than in\\nLlama2. Let's walk through tokenizing an example with the Llama2 template and the\\nLlama3 template to understand how.\\n\\n.. note::\\n The Llama3 Base model uses a `different prompt template\\n `_ than Llama3 Instruct\\n because it has not yet been instruct tuned and the extra special tokens are untrained. If you\\n are running inference on the Llama3 Base model without fine-tuning we recommend the base\\n template for optimal performance. Generally, for instruct and chat data, we recommend using\\n Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\\n Llama3 Instruct.\\n\\n.. _prompt_template_vs_special_tokens:\\n\\nTokenizing prompt templates & special tokens\\n--------------------------------------------\\n\\nLet's say I have a sample of a single user-assistant turn accompanied with a system\\nprompt:\\n\\n.. code-block:: python\\n\\n sample = [\\n {\\n \\\"role\\\": \\\"system\\\",\\n \\\"\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:num-3\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\", and \\\"output_proj\\\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\\\"q_proj\\\", \\\"v_proj\\\"])\\n\\n.. note::\\n\\n Calling :func:`lora_llama_2\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " case, I will assume that the function get_boiling_point is defined as", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -46458,7 +34772,7 @@ "data": { "event": { "delta": { - "text": " follows:\ndef get", + "text": "L", "type": "text" }, "event_type": { @@ -46478,7 +34792,7 @@ "data": { "event": { "delta": { - "text": "_boiling_point(liquid_name, celcius=True):\n # This", + "text": "lama3-8B uses grouped-query attention instead of", "type": "text" }, "event_type": { @@ -46498,7 +34812,7 @@ "data": { "event": { "delta": { - "text": " function returns the", + "text": " the standard", "type": "text" }, "event_type": { @@ -46518,7 +34832,7 @@ "data": { "event": { "delta": { - "text": " boiling point of a liquid in Celcius or Fahrenheit\n boiling_points", + "text": " multi-head attention.", "type": "text" }, "event_type": { @@ -46538,33 +34852,94 @@ "data": { "event": { "delta": { - "text": " = {\n \"water\": 100,\n \"polyjuice\":", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "SlTnlfYc", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:12.884663+00:00", + "__module__": "datetime" + }, + "trace_id": "liTx9auyTkyfvrBr", + "type": "metric", + "unit": "tokens", + "value": 80 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "SlTnlfYc", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:12.884753+00:00", + "__module__": "datetime" + }, + "trace_id": "liTx9auyTkyfvrBr", + "type": "metric", + "unit": "tokens", + "value": 28 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "SlTnlfYc", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:12.884760+00:00", + "__module__": "datetime" + }, + "trace_id": "liTx9auyTkyfvrBr", + "type": "metric", + "unit": "tokens", + "value": 108 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Insert documents into memory\", \"parameters\": {}, \"tool_name\": \"insert_into_memory\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " 120 # Assuming poly", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -46578,7 +34953,7 @@ "data": { "event": { "delta": { - "text": "juice has a boiling point of 120 degrees Cel", + "text": "[k", "type": "text" }, "event_type": { @@ -46598,7 +34973,7 @@ "data": { "event": { "delta": { - "text": "cius\n }\n if liquid", + "text": "nowledge_search(query=\"Llama3-8", "type": "text" }, "event_type": { @@ -46618,7 +34993,7 @@ "data": { "event": { "delta": { - "text": "_name in boiling_points:\n if celcius:\n return", + "text": "B attention type\")]", "type": "text" }, "event_type": { @@ -46638,8 +35013,19 @@ "data": { "event": { "delta": { - "text": " boiling_points[liquid_name]\n else:\n return boiling_points[liquid", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "Llama3-8B attention type" + }, + "call_id": "4901bbdf-8faf-4a57-b6f6-01688c6290e6", + "tool_name": "knowledge_search" + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -46647,7 +35033,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -46658,33 +35048,94 @@ "data": { "event": { "delta": { - "text": "_name] * 9/5 + ", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "DBPomV08", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:15.412559+00:00", + "__module__": "datetime" + }, + "trace_id": "rNeuYcnxTSqrP6Dg", + "type": "metric", + "unit": "tokens", + "value": 40 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "DBPomV08", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:15.412607+00:00", + "__module__": "datetime" + }, + "trace_id": "rNeuYcnxTSqrP6Dg", + "type": "metric", + "unit": "tokens", + "value": 24 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "DBPomV08", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:15.412615+00:00", + "__module__": "datetime" + }, + "trace_id": "rNeuYcnxTSqrP6Dg", + "type": "metric", + "unit": "tokens", + "value": 64 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "32\n else:\n return \"Boiling point not found", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -46698,7 +35149,7 @@ "data": { "event": { "delta": { - "text": "\"\n\nNow, the function call", + "text": "[k", "type": "text" }, "event_type": { @@ -46718,7 +35169,7 @@ "data": { "event": { "delta": { - "text": " should be: \n", + "text": "nowledge_search(query=\"Llama3-8B attention", "type": "text" }, "event_type": { @@ -46738,7 +35189,7 @@ "data": { "event": { "delta": { - "text": "[get_boiling_point(liquid_name=\"polyju", + "text": " type\")]", "type": "text" }, "event_type": { @@ -46758,8 +35209,19 @@ "data": { "event": { "delta": { - "text": "ice\", celcius=True)]", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "Llama3-8B attention type" + }, + "call_id": "dd056386-b105-47e5-bd85-07e5ae096de1", + "tool_name": "knowledge_search" + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -46767,7 +35229,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -46800,16 +35266,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "NnkGeCwM", + "span_id": "yjKrmpeo", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:35.213901+00:00", + "__datetime__": "2025-03-07T01:45:12.041566+00:00", "__module__": "datetime" }, - "trace_id": "7ifSRjCjRIioDOte", + "trace_id": "liTx9auyTkyfvrBr", "type": "metric", "unit": "tokens", - "value": 86 + "value": 40 }, { "attributes": { @@ -46817,16 +35283,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "NnkGeCwM", + "span_id": "yjKrmpeo", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:35.213925+00:00", + "__datetime__": "2025-03-07T01:45:12.041591+00:00", "__module__": "datetime" }, - "trace_id": "7ifSRjCjRIioDOte", + "trace_id": "liTx9auyTkyfvrBr", "type": "metric", "unit": "tokens", - "value": 234 + "value": 24 }, { "attributes": { @@ -46834,16 +35300,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "NnkGeCwM", + "span_id": "yjKrmpeo", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:35.213931+00:00", + "__datetime__": "2025-03-07T01:45:12.041597+00:00", "__module__": "datetime" }, - "trace_id": "7ifSRjCjRIioDOte", + "trace_id": "liTx9auyTkyfvrBr", "type": "metric", "unit": "tokens", - "value": 320 + "value": 64 } ] } @@ -46851,7 +35317,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": \"get_boiling_point\", \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the current CEO of Meta is.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"current CEO of Meta\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"{\\\"query\\\": \\\"current CEO of Meta\\\", \\\"top_k\\\": [{\\\"title\\\": \\\"Meta - Leadership & Governance\\\", \\\"url\\\": \\\"https://investor.atmeta.com/leadership-and-governance/\\\", \\\"content\\\": \\\"Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. Mark is responsible for setting the overall direction and product strategy for the company. He leads the design of Meta's services and development of its core technology and infrastructure. Mark studied computer science at Harvard\\\", \\\"score\\\": 0.8342047, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Executives - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer Joel Kaplan, Chief Global Affairs Officer Susan Li, Chief Financial Officer Javier Olivan, Chief Operating Officer Chris Cox, Chief Product Officer Andrew \\\\u2018Boz\\\\u2019 Bosworth, Chief Technology Officer Jennifer Newstead, Chief Legal Officer Dave Wehner, Chief Strategy Officer Will Cathcart, Head of WhatsApp Naomi Gleit, Head of Product John Hegeman, Chief Revenue Officer Adam Mosseri, Head of Instagram Erin Egan, Chief Privacy Officer, Policy Michel Protti, Chief Privacy Officer, Product Alex Schultz, Chief Marketing Officer and VP of Analytics Tom Alison, Head of Facebook Nicola Mendelsohn, Head of Global Business Group Ahmad Al-Dahle, VP and Head of GenAI at Meta Joelle Pineau, Vice President of AI Research and Head of FAIR at Meta\\\", \\\"score\\\": 0.8190992, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\\\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\\\u00a9 2025 Meta\\\", \\\"score\\\": 0.79099923, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Meet the Executive CSuite Team of Meta (Facebook) [2025]\\\", \\\"url\\\": \\\"https://digitaldefynd.com/IQ/meet-the-executive-csuite-team-of-meta-facebook/\\\", \\\"content\\\": \\\"Harvard University Executive Programs Free Harvard University Courses As a chief financial officer of Meta, Susan Li oversees the firm\\\\u2019s finance and facilities team to keep track of the company\\\\u2019s overall financial health. The chief operating officer of Meta, Javier Olivan, oversees the firm\\\\u2019s business team, infrastructure, and other products. Andrew Bosworth, called Boz, serves as chief technology officer at Meta and is responsible for leading the firm\\\\u2019s AR/VR organization, Reality Labs. Andrew has also served as engineering director to oversee events, mobile monetization, and feed ads and as VP of ads and business platforms to lead engineering, design, analytics, and product teams. Meta\\\\u2019s c-suite team comprises experienced and diverse executives, having extensive experience in technology, finance, legal, and all major industries.\\\", \\\"score\\\": 0.7602419, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg - Wikipedia\\\", \\\"url\\\": \\\"https://en.wikipedia.org/wiki/Mark_Zuckerberg\\\", \\\"content\\\": \\\"They began dating in 2003.[175] In September 2010, Chan, who was a medical student at the University of California, San Francisco at the time,[176] moved into his rented house in Palo Alto, California.[177][178] They married on May 19, 2012, in the grounds of his mansion in an event that also celebrated her graduation from medical school.[179][180] Zuckerberg revealed in July 2015 that they were expecting a baby girl and that Chan had previously experienced three miscarriages.[181] Their first daughter was born in December 2015.[182] They announced in a Chinese New Year video that their daughter's Chinese name is Chen Mingyu (Chinese: \\\\u9648\\\\u660e\\\\u5b87).[183] Their second daughter was born in August 2017.[184] Zuckerberg and his wife welcomed their third daughter in March 2023 and announced the news across his social media pages.[185] The couple also have a Puli dog named Beast,[186] who has over two million followers on Facebook.[187] Zuckerberg commissioned the visual artist Daniel Arsham to build a 7-foot-tall sculpture of his wife, which was unveiled in 2024.[188]\\\", \\\"score\\\": 0.05564338, \\\"raw_content\\\": null}]}\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -46879,7 +35345,7 @@ "data": { "event": { "delta": { - "text": "[", + "text": "The", "type": "text" }, "event_type": { @@ -46899,7 +35365,7 @@ "data": { "event": { "delta": { - "text": "get_boiling", + "text": " current CEO of Meta is Mark Zuckerberg.", "type": "text" }, "event_type": { @@ -46919,33 +35385,94 @@ "data": { "event": { "delta": { - "text": "_point(liquid_name", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "oB7hDf6E", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:07.084924+00:00", + "__module__": "datetime" + }, + "trace_id": "hwA8OLUhQ1qa3ecF", + "type": "metric", + "unit": "tokens", + "value": 1145 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "oB7hDf6E", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:07.084934+00:00", + "__module__": "datetime" + }, + "trace_id": "hwA8OLUhQ1qa3ecF", + "type": "metric", + "unit": "tokens", + "value": 19 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "oB7hDf6E", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:07.084936+00:00", + "__module__": "datetime" + }, + "trace_id": "hwA8OLUhQ1qa3ecF", + "type": "metric", + "unit": "tokens", + "value": 1164 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the current CEO of Meta is.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "='polyjuice", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -46959,8 +35486,13 @@ "data": { "event": { "delta": { - "text": "', celcius", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -46979,8 +35511,13 @@ "data": { "event": { "delta": { - "text": "=True)]", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "brave_search.call(query=\"current CEO of Meta\")", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -47006,11 +35543,14 @@ }, "tool_call": { "arguments": { - "celcius": true, - "liquid_name": "polyjuice" + "query": "current CEO of Meta" }, - "call_id": "50ff6c7b-d098-4a3c-b299-117afe819175", - "tool_name": "get_boiling_point" + "call_id": "535c272b-768b-44fe-b303-2eae022f67f5", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "brave_search" + } }, "type": "tool_call" }, @@ -47057,16 +35597,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "c_LlCAG8", + "span_id": "AZ60Ocso", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:17:00.095577+00:00", + "__datetime__": "2025-03-07T01:44:03.907918+00:00", "__module__": "datetime" }, - "trace_id": "zeoAtcHFTnmC8N9f", + "trace_id": "hwA8OLUhQ1qa3ecF", "type": "metric", "unit": "tokens", - "value": 30 + "value": 34 }, { "attributes": { @@ -47074,16 +35614,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "c_LlCAG8", + "span_id": "AZ60Ocso", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:17:00.095608+00:00", + "__datetime__": "2025-03-07T01:44:03.907933+00:00", "__module__": "datetime" }, - "trace_id": "zeoAtcHFTnmC8N9f", + "trace_id": "hwA8OLUhQ1qa3ecF", "type": "metric", "unit": "tokens", - "value": 28 + "value": 10 }, { "attributes": { @@ -47091,16 +35631,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "c_LlCAG8", + "span_id": "AZ60Ocso", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:17:00.095615+00:00", + "__datetime__": "2025-03-07T01:44:03.907936+00:00", "__module__": "datetime" }, - "trace_id": "zeoAtcHFTnmC8N9f", + "trace_id": "hwA8OLUhQ1qa3ecF", "type": "metric", "unit": "tokens", - "value": 58 + "value": 44 } ] } @@ -47108,7 +35648,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": \"get_boiling_point\", \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -47136,87 +35676,7 @@ "data": { "event": { "delta": { - "text": "[", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "get_bo", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "iling_point(", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "liquid_name='", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "polyjuice', celci", + "text": "The", "type": "text" }, "event_type": { @@ -47236,7 +35696,7 @@ "data": { "event": { "delta": { - "text": "us=True)]", + "text": " boiling point of polyjuice is -100 degrees Celsius", "type": "text" }, "event_type": { @@ -47256,20 +35716,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "celcius": true, - "liquid_name": "polyjuice" - }, - "call_id": "df35163f-539a-47dc-97e3-2569a6ad92fc", - "tool_name": "get_boiling_point" - }, - "type": "tool_call" + "text": ".", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -47277,11 +35725,7 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } @@ -47314,16 +35758,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "jzMZxiDn", + "span_id": "drZjZkfj", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:16:33.874568+00:00", + "__datetime__": "2025-03-07T02:04:33.852666+00:00", "__module__": "datetime" }, - "trace_id": "3v9VtTtdSdGcu8a7", + "trace_id": "Sn0I7GFHTxKxewK2", "type": "metric", "unit": "tokens", - "value": 30 + "value": 77 }, { "attributes": { @@ -47331,16 +35775,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "jzMZxiDn", + "span_id": "drZjZkfj", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:16:33.874602+00:00", + "__datetime__": "2025-03-07T02:04:33.852692+00:00", "__module__": "datetime" }, - "trace_id": "3v9VtTtdSdGcu8a7", + "trace_id": "Sn0I7GFHTxKxewK2", "type": "metric", "unit": "tokens", - "value": 28 + "value": 23 }, { "attributes": { @@ -47348,16 +35792,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "jzMZxiDn", + "span_id": "drZjZkfj", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:16:33.874608+00:00", + "__datetime__": "2025-03-07T02:04:33.852699+00:00", "__module__": "datetime" }, - "trace_id": "3v9VtTtdSdGcu8a7", + "trace_id": "Sn0I7GFHTxKxewK2", "type": "metric", "unit": "tokens", - "value": 58 + "value": 100 } ] } @@ -47365,7 +35809,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"none\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -47393,7 +35837,7 @@ "data": { "event": { "delta": { - "text": "Poly", + "text": "The", "type": "text" }, "event_type": { @@ -47413,7 +35857,7 @@ "data": { "event": { "delta": { - "text": "juice Potion", + "text": " boiling point of polyjuice is -100 degrees Celsius.", "type": "text" }, "event_type": { @@ -47433,53 +35877,94 @@ "data": { "event": { "delta": { - "text": " is a", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " fictional substance from the Harry Potter", - "type": "text" + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "WMEZtUXH", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:00:32.617998+00:00", + "__module__": "datetime" + }, + "trace_id": "f9RM1qaUTk2LvaVo", + "type": "metric", + "unit": "tokens", + "value": 77 }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "WMEZtUXH", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:00:32.618030+00:00", + "__module__": "datetime" + }, + "trace_id": "f9RM1qaUTk2LvaVo", + "type": "metric", + "unit": "tokens", + "value": 23 }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "WMEZtUXH", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:00:32.618036+00:00", + "__module__": "datetime" + }, + "trace_id": "f9RM1qaUTk2LvaVo", + "type": "metric", + "unit": "tokens", + "value": 100 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"required\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " book series by J.K", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -47493,7 +35978,7 @@ "data": { "event": { "delta": { - "text": ". Rowling. As it", + "text": "The", "type": "text" }, "event_type": { @@ -47513,7 +35998,7 @@ "data": { "event": { "delta": { - "text": "'s", + "text": " function get_boiling_point is not", "type": "text" }, "event_type": { @@ -47533,7 +36018,7 @@ "data": { "event": { "delta": { - "text": " not a", + "text": " able", "type": "text" }, "event_type": { @@ -47553,7 +36038,7 @@ "data": { "event": { "delta": { - "text": " real-world substance", + "text": " to find the", "type": "text" }, "event_type": { @@ -47573,7 +36058,7 @@ "data": { "event": { "delta": { - "text": ", it doesn't have a", + "text": " boiling point of \"polyjuice\" as", "type": "text" }, "event_type": { @@ -47593,7 +36078,7 @@ "data": { "event": { "delta": { - "text": " boiling point or", + "text": " it", "type": "text" }, "event_type": { @@ -47613,7 +36098,7 @@ "data": { "event": { "delta": { - "text": " any", + "text": " is not a real liquid", "type": "text" }, "event_type": { @@ -47633,7 +36118,7 @@ "data": { "event": { "delta": { - "text": " other physical properties that can", + "text": ". Polyju", "type": "text" }, "event_type": { @@ -47653,7 +36138,7 @@ "data": { "event": { "delta": { - "text": " be measured.\n\n", + "text": "ice is a fictional substance from the", "type": "text" }, "event_type": { @@ -47673,7 +36158,7 @@ "data": { "event": { "delta": { - "text": "In the", + "text": " Harry Potter series.", "type": "text" }, "event_type": { @@ -47693,53 +36178,94 @@ "data": { "event": { "delta": { - "text": " Harry Potter universe,", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " Polyjuice Potion", - "type": "text" + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "p7Vx9VAq", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:04:28.232189+00:00", + "__module__": "datetime" + }, + "trace_id": "WKEqFugATCeCl8mc", + "type": "metric", + "unit": "tokens", + "value": 77 }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "p7Vx9VAq", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:04:28.232325+00:00", + "__module__": "datetime" + }, + "trace_id": "WKEqFugATCeCl8mc", + "type": "metric", + "unit": "tokens", + "value": 51 }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "p7Vx9VAq", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:04:28.232334+00:00", + "__module__": "datetime" + }, + "trace_id": "WKEqFugATCeCl8mc", + "type": "metric", + "unit": "tokens", + "value": 128 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": \"get_boiling_point\", \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " is a", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -47753,7 +36279,7 @@ "data": { "event": { "delta": { - "text": " magical brew that", + "text": "The", "type": "text" }, "event_type": { @@ -47773,7 +36299,7 @@ "data": { "event": { "delta": { - "text": " allows the drinker", + "text": " function call should be", "type": "text" }, "event_type": { @@ -47793,7 +36319,7 @@ "data": { "event": { "delta": { - "text": " to assume", + "text": ":\n[", "type": "text" }, "event_type": { @@ -47813,7 +36339,7 @@ "data": { "event": { "delta": { - "text": " the form and", + "text": "get", "type": "text" }, "event_type": { @@ -47833,7 +36359,7 @@ "data": { "event": { "delta": { - "text": " appearance of another person", + "text": "_boiling_point(liquid_name='polyjuice', celci", "type": "text" }, "event_type": { @@ -47853,7 +36379,7 @@ "data": { "event": { "delta": { - "text": ". Its properties", + "text": "us=True)]", "type": "text" }, "event_type": { @@ -47873,53 +36399,94 @@ "data": { "event": { "delta": { - "text": " and behavior are", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " governed by the rules of magic within the fictional world, rather than", - "type": "text" + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "JN7UZs_c", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:42.473221+00:00", + "__module__": "datetime" + }, + "trace_id": "H3r-_Zh-TVqtSp7k", + "type": "metric", + "unit": "tokens", + "value": 86 }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "JN7UZs_c", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:42.473254+00:00", + "__module__": "datetime" + }, + "trace_id": "H3r-_Zh-TVqtSp7k", + "type": "metric", + "unit": "tokens", + "value": 34 }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "JN7UZs_c", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:42.473261+00:00", + "__module__": "datetime" + }, + "trace_id": "H3r-_Zh-TVqtSp7k", + "type": "metric", + "unit": "tokens", + "value": 120 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " by the laws", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -47933,7 +36500,7 @@ "data": { "event": { "delta": { - "text": " of physics and chemistry", + "text": "The", "type": "text" }, "event_type": { @@ -47953,7 +36520,7 @@ "data": { "event": { "delta": { - "text": " that apply to real", + "text": " function `get_boiling_point`", "type": "text" }, "event_type": { @@ -47973,7 +36540,7 @@ "data": { "event": { "delta": { - "text": "-world substances.\n\nSo", + "text": " is not a real function and cannot be", "type": "text" }, "event_type": { @@ -47993,7 +36560,7 @@ "data": { "event": { "delta": { - "text": ", I", + "text": " used to determine the boiling point of polyju", "type": "text" }, "event_type": { @@ -48013,7 +36580,7 @@ "data": { "event": { "delta": { - "text": "'m afraid", + "text": "ice. Polyjuice is a fictional substance from the", "type": "text" }, "event_type": { @@ -48033,7 +36600,7 @@ "data": { "event": { "delta": { - "text": " there's no", + "text": " Harry Potter series and does not have a real-world boiling", "type": "text" }, "event_type": { @@ -48053,7 +36620,7 @@ "data": { "event": { "delta": { - "text": " boiling point to report", + "text": " point. If you have any other questions or need help", "type": "text" }, "event_type": { @@ -48073,7 +36640,7 @@ "data": { "event": { "delta": { - "text": " for Polyjuice", + "text": " with a different topic, feel free to ask!", "type": "text" }, "event_type": { @@ -48093,33 +36660,94 @@ "data": { "event": { "delta": { - "text": " Potion!", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "aCPTIc0d", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:53:27.227208+00:00", + "__module__": "datetime" + }, + "trace_id": "4DRyVE86RpCeqfpE", + "type": "metric", + "unit": "tokens", + "value": 86 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "aCPTIc0d", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:53:27.227251+00:00", + "__module__": "datetime" + }, + "trace_id": "4DRyVE86RpCeqfpE", + "type": "metric", + "unit": "tokens", + "value": 78 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "aCPTIc0d", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:53:27.227258+00:00", + "__module__": "datetime" + }, + "trace_id": "4DRyVE86RpCeqfpE", + "type": "metric", + "unit": "tokens", + "value": 164 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"required\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " Would you like to", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -48133,7 +36761,7 @@ "data": { "event": { "delta": { - "text": " know more about the", + "text": "The", "type": "text" }, "event_type": { @@ -48153,7 +36781,7 @@ "data": { "event": { "delta": { - "text": " Harry Potter series or", + "text": " function call should be in the following format", "type": "text" }, "event_type": { @@ -48173,7 +36801,7 @@ "data": { "event": { "delta": { - "text": " is there something else", + "text": ": [function_name(parameters)]. However", "type": "text" }, "event_type": { @@ -48193,7 +36821,7 @@ "data": { "event": { "delta": { - "text": " I can help you", + "text": ", the function get_boiling_point is not recognized", "type": "text" }, "event_type": { @@ -48213,7 +36841,7 @@ "data": { "event": { "delta": { - "text": " with?", + "text": ". If the function", "type": "text" }, "event_type": { @@ -48233,94 +36861,13 @@ "data": { "event": { "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "MD9yQkRd", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-12T23:16:55.850173+00:00", - "__module__": "datetime" - }, - "trace_id": "fh6SDMFUQtK_wjC3", - "type": "metric", - "unit": "tokens", - "value": 30 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "MD9yQkRd", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-12T23:16:55.850206+00:00", - "__module__": "datetime" - }, - "trace_id": "fh6SDMFUQtK_wjC3", - "type": "metric", - "unit": "tokens", - "value": 157 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "MD9yQkRd", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-12T23:16:55.850213+00:00", - "__module__": "datetime" - }, - "trace_id": "fh6SDMFUQtK_wjC3", - "type": "metric", - "unit": "tokens", - "value": 187 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"required\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", + "text": " is supposed to return the boiling point of a liquid, it should be defined", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -48334,7 +36881,7 @@ "data": { "event": { "delta": { - "text": "[", + "text": " before it can be used. \n\nIn this", "type": "text" }, "event_type": { @@ -48354,7 +36901,7 @@ "data": { "event": { "delta": { - "text": "get_boiling_point(liquid", + "text": " case, I will assume that the function get_boiling_point is defined as", "type": "text" }, "event_type": { @@ -48374,7 +36921,7 @@ "data": { "event": { "delta": { - "text": "_name='polyjuice',", + "text": " follows:\ndef get", "type": "text" }, "event_type": { @@ -48394,7 +36941,7 @@ "data": { "event": { "delta": { - "text": " celcius=True)]", + "text": "_boiling_point(liquid_name, celcius=True):\n # This", "type": "text" }, "event_type": { @@ -48414,20 +36961,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "celcius": true, - "liquid_name": "polyjuice" - }, - "call_id": "42a89a20-0a36-41cc-83a0-2725428f91b7", - "tool_name": "get_boiling_point" - }, - "type": "tool_call" + "text": " function returns the", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -48435,11 +36970,7 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } @@ -48450,94 +36981,33 @@ "data": { "event": { "delta": { - "text": "", + "text": " boiling point of a liquid in Celcius or Fahrenheit\n boiling_points", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "YObSruYs", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-12T23:16:46.763088+00:00", - "__module__": "datetime" - }, - "trace_id": "ovdyCmhfRdG3MKrj", - "type": "metric", - "unit": "tokens", - "value": 30 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "YObSruYs", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-12T23:16:46.763121+00:00", - "__module__": "datetime" - }, - "trace_id": "ovdyCmhfRdG3MKrj", - "type": "metric", - "unit": "tokens", - "value": 28 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "YObSruYs", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-12T23:16:46.763131+00:00", - "__module__": "datetime" - }, - "trace_id": "ovdyCmhfRdG3MKrj", - "type": "metric", - "unit": "tokens", - "value": 58 - } - ] + "stop_reason": null + }, + "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Write code and execute it to find the answer for: What is the 100th prime number?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"def is_prime(n):\\n if n <= 1:\\n return False\\n if n <= 3:\\n return True\\n if n % 2 == 0 or n % 3 == 0:\\n return False\\n i = 5\\n while i * i <= n:\\n if n % i == 0 or n % (i + 2) == 0:\\n return False\\n i += 6\\n return True\\n\\ndef nth_prime(n):\\n count = 0\\n num = 2\\n while True:\\n if is_prime(num):\\n count += 1\\n if count == n:\\n return num\\n num += 1\\n\\nprint(nth_prime(100))\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", + "text": " = {\n \"water\": 100,\n \"polyjuice\":", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -48551,7 +37021,7 @@ "data": { "event": { "delta": { - "text": "The", + "text": " 120 # Assuming poly", "type": "text" }, "event_type": { @@ -48571,7 +37041,7 @@ "data": { "event": { "delta": { - "text": " code defines two", + "text": "juice has a boiling point of 120 degrees Cel", "type": "text" }, "event_type": { @@ -48591,7 +37061,7 @@ "data": { "event": { "delta": { - "text": " functions: `is_prime(n", + "text": "cius\n }\n if liquid", "type": "text" }, "event_type": { @@ -48611,7 +37081,7 @@ "data": { "event": { "delta": { - "text": ")` checks if", + "text": "_name in boiling_points:\n if celcius:\n return", "type": "text" }, "event_type": { @@ -48631,7 +37101,7 @@ "data": { "event": { "delta": { - "text": " a number `n` is", + "text": " boiling_points[liquid_name]\n else:\n return boiling_points[liquid", "type": "text" }, "event_type": { @@ -48651,7 +37121,7 @@ "data": { "event": { "delta": { - "text": " prime, and `nth_prime", + "text": "_name] * 9/5 + ", "type": "text" }, "event_type": { @@ -48671,7 +37141,7 @@ "data": { "event": { "delta": { - "text": "(n)` finds the `n", + "text": "32\n else:\n return \"Boiling point not found", "type": "text" }, "event_type": { @@ -48691,7 +37161,7 @@ "data": { "event": { "delta": { - "text": "`th prime number. The", + "text": "\"\n\nNow, the function call", "type": "text" }, "event_type": { @@ -48711,7 +37181,7 @@ "data": { "event": { "delta": { - "text": " `is_prime(n)` function checks", + "text": " should be: \n", "type": "text" }, "event_type": { @@ -48731,7 +37201,7 @@ "data": { "event": { "delta": { - "text": " if", + "text": "[get_boiling_point(liquid_name=\"polyju", "type": "text" }, "event_type": { @@ -48751,7 +37221,7 @@ "data": { "event": { "delta": { - "text": " `n", + "text": "ice\", celcius=True)]", "type": "text" }, "event_type": { @@ -48771,13 +37241,94 @@ "data": { "event": { "delta": { - "text": "` is less than or equal", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "NnkGeCwM", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:35.213901+00:00", + "__module__": "datetime" + }, + "trace_id": "7ifSRjCjRIioDOte", + "type": "metric", + "unit": "tokens", + "value": 86 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "NnkGeCwM", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:35.213925+00:00", + "__module__": "datetime" + }, + "trace_id": "7ifSRjCjRIioDOte", + "type": "metric", + "unit": "tokens", + "value": 234 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "NnkGeCwM", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:35.213931+00:00", + "__module__": "datetime" + }, + "trace_id": "7ifSRjCjRIioDOte", + "type": "metric", + "unit": "tokens", + "value": 320 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": \"get_boiling_point\", \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" }, "logprobs": null, "stop_reason": null @@ -48791,7 +37342,7 @@ "data": { "event": { "delta": { - "text": " to 1 (not", + "text": "[", "type": "text" }, "event_type": { @@ -48811,7 +37362,7 @@ "data": { "event": { "delta": { - "text": " prime), less than or equal", + "text": "get_boiling_point(liquid_name='polyjuice", "type": "text" }, "event_type": { @@ -48831,7 +37382,7 @@ "data": { "event": { "delta": { - "text": " to 3", + "text": "', celcius=True)]", "type": "text" }, "event_type": { @@ -48851,8 +37402,20 @@ "data": { "event": { "delta": { - "text": " (prime), or if it", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "celcius": true, + "liquid_name": "polyjuice" + }, + "call_id": "d43b2636-903d-430d-8389-91eefe5a1d75", + "tool_name": "get_boiling_point" + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -48860,7 +37423,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -48871,13 +37438,94 @@ "data": { "event": { "delta": { - "text": "'s divisible by 2 or", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "9EBiVeAT", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:04:32.221646+00:00", + "__module__": "datetime" + }, + "trace_id": "7kB12OwpSUOcwmJV", + "type": "metric", + "unit": "tokens", + "value": 30 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "9EBiVeAT", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:04:32.221673+00:00", + "__module__": "datetime" + }, + "trace_id": "7kB12OwpSUOcwmJV", + "type": "metric", + "unit": "tokens", + "value": 28 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "9EBiVeAT", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:04:32.221680+00:00", + "__module__": "datetime" + }, + "trace_id": "7kB12OwpSUOcwmJV", + "type": "metric", + "unit": "tokens", + "value": 58 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" }, "logprobs": null, "stop_reason": null @@ -48891,7 +37539,7 @@ "data": { "event": { "delta": { - "text": " 3 (not prime).", + "text": "[", "type": "text" }, "event_type": { @@ -48911,7 +37559,7 @@ "data": { "event": { "delta": { - "text": " If none of", + "text": "get_boiling_point(liquid_name", "type": "text" }, "event_type": { @@ -48931,7 +37579,7 @@ "data": { "event": { "delta": { - "text": " these conditions are met, it", + "text": "='polyjuice', celcius=True)]", "type": "text" }, "event_type": { @@ -48951,8 +37599,20 @@ "data": { "event": { "delta": { - "text": " checks divisibility", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "celcius": true, + "liquid_name": "polyjuice" + }, + "call_id": "0548b2ef-daa4-4099-bb2c-b34f00752339", + "tool_name": "get_boiling_point" + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -48960,7 +37620,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -48971,33 +37635,94 @@ "data": { "event": { "delta": { - "text": " by numbers", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "lc3YWIQH", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:00:31.366139+00:00", + "__module__": "datetime" + }, + "trace_id": "zDQV0rn3TNKfByA0", + "type": "metric", + "unit": "tokens", + "value": 30 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "lc3YWIQH", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:00:31.366166+00:00", + "__module__": "datetime" + }, + "trace_id": "zDQV0rn3TNKfByA0", + "type": "metric", + "unit": "tokens", + "value": 28 }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "lc3YWIQH", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:00:31.366172+00:00", + "__module__": "datetime" + }, + "trace_id": "zDQV0rn3TNKfByA0", + "type": "metric", + "unit": "tokens", + "value": 58 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"none\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " of the", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -49011,7 +37736,7 @@ "data": { "event": { "delta": { - "text": " form 6k \u00b1 ", + "text": "Poly", "type": "text" }, "event_type": { @@ -49031,7 +37756,7 @@ "data": { "event": { "delta": { - "text": "1, where", + "text": "juice is a fictional potion from", "type": "text" }, "event_type": { @@ -49051,7 +37776,7 @@ "data": { "event": { "delta": { - "text": " k is an integer. The", + "text": " the Harry Potter series by J.K. Rowling. As it", "type": "text" }, "event_type": { @@ -49071,7 +37796,7 @@ "data": { "event": { "delta": { - "text": " `nth_prime(n", + "text": "'s not a real substance, it doesn't have a boiling point", "type": "text" }, "event_type": { @@ -49091,7 +37816,7 @@ "data": { "event": { "delta": { - "text": ")` function iterates", + "text": ". Polyjuice Potion is a magical concoction", "type": "text" }, "event_type": { @@ -49111,7 +37836,7 @@ "data": { "event": { "delta": { - "text": " through numbers", + "text": " that allows the drinker to assume the form and", "type": "text" }, "event_type": { @@ -49131,7 +37856,7 @@ "data": { "event": { "delta": { - "text": " starting from 2, checks", + "text": " appearance", "type": "text" }, "event_type": { @@ -49151,7 +37876,7 @@ "data": { "event": { "delta": { - "text": " if each number is prime using", + "text": " of another person, but it's not a physical substance that can", "type": "text" }, "event_type": { @@ -49171,7 +37896,7 @@ "data": { "event": { "delta": { - "text": " the `is_prime(n", + "text": " be measured or analyzed in the same way as real-world", "type": "text" }, "event_type": { @@ -49191,7 +37916,7 @@ "data": { "event": { "delta": { - "text": ")` function, and increments a", + "text": " chemicals.\n\nIf you", "type": "text" }, "event_type": { @@ -49211,7 +37936,7 @@ "data": { "event": { "delta": { - "text": " counter until it", + "text": " have any other questions or", "type": "text" }, "event_type": { @@ -49231,7 +37956,7 @@ "data": { "event": { "delta": { - "text": " finds the `n`th", + "text": " if there's anything else I can help you with, feel free to ask", "type": "text" }, "event_type": { @@ -49251,7 +37976,7 @@ "data": { "event": { "delta": { - "text": " prime number.", + "text": "!", "type": "text" }, "event_type": { @@ -49293,16 +38018,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "DP8eFcnZ", + "span_id": "M0oC9v8Y", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:16:11.055365+00:00", + "__datetime__": "2025-03-07T02:04:30.531648+00:00", "__module__": "datetime" }, - "trace_id": "eUBw_VOpS32wNAGH", + "trace_id": "0CMlh2kQShSVm3zE", "type": "metric", "unit": "tokens", - "value": 252 + "value": 30 }, { "attributes": { @@ -49310,16 +38035,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "DP8eFcnZ", + "span_id": "M0oC9v8Y", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:16:11.055474+00:00", + "__datetime__": "2025-03-07T02:04:30.531666+00:00", "__module__": "datetime" }, - "trace_id": "eUBw_VOpS32wNAGH", + "trace_id": "0CMlh2kQShSVm3zE", "type": "metric", "unit": "tokens", - "value": 171 + "value": 113 }, { "attributes": { @@ -49327,16 +38052,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "DP8eFcnZ", + "span_id": "M0oC9v8Y", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:16:11.055486+00:00", + "__datetime__": "2025-03-07T02:04:30.531671+00:00", "__module__": "datetime" }, - "trace_id": "eUBw_VOpS32wNAGH", + "trace_id": "0CMlh2kQShSVm3zE", "type": "metric", "unit": "tokens", - "value": 423 + "value": 143 } ] } @@ -49344,7 +38069,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Write code and execute it to find the answer for: What is the 100th prime number?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"required\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -49372,13 +38097,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" + "text": "[", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -49397,13 +38117,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "def is_prime(n):\n ", - "type": "tool_call" + "text": "get_boiling_point(liquid_name='polyjuice', cel", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -49422,13 +38137,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " if n <= 1:\n return", - "type": "tool_call" + "text": "cius=True)]", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -49450,34 +38160,16 @@ "parse_status": { "__enum__": "ToolCallParseStatus", "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" + "value": "succeeded" }, - "tool_call": " False\n if", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" + "tool_call": { + "arguments": { + "celcius": true, + "liquid_name": "polyjuice" + }, + "call_id": "acbb04a1-08f4-4277-9b66-aadda2fa2be7", + "tool_name": "get_boiling_point" }, - "tool_call": " n <= 3:\n ", "type": "tool_call" }, "event_type": { @@ -49486,7 +38178,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -49497,118 +38193,94 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " return True\n if n", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" }, - "tool_call": " % 2 == 0", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" + "metric": "prompt_tokens", + "span_id": "jMXDDKvp", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:04:26.175063+00:00", + "__module__": "datetime" }, - "tool_call": " or n % 3 ==", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "trace_id": "44TwzIrGS2aqfbVn", + "type": "metric", + "unit": "tokens", + "value": 30 }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" }, - "tool_call": " 0:\n return False", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "metric": "completion_tokens", + "span_id": "jMXDDKvp", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:04:26.175128+00:00", + "__module__": "datetime" + }, + "trace_id": "44TwzIrGS2aqfbVn", + "type": "metric", + "unit": "tokens", + "value": 28 }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "jMXDDKvp", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:04:26.175137+00:00", + "__module__": "datetime" + }, + "trace_id": "44TwzIrGS2aqfbVn", + "type": "metric", + "unit": "tokens", + "value": 58 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Write code and execute it to find the answer for: What is the 100th prime number?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"def is_prime(n):\\n if n <= 1:\\n return False\\n if n <= 3:\\n return True\\n if n % 2 == 0 or n % 3 == 0:\\n return False\\n i = 5\\n while i * i <= n:\\n if n % i == 0 or n % (i + 2) == 0:\\n return False\\n i += 6\\n return True\\n\\ndef nth_prime(n):\\n count = 0\\n num = 2\\n while True:\\n if is_prime(num):\\n count += 1\\n if count == n:\\n return num\\n num += 1\\n\\nprint(nth_prime(100))\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "\n i", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -49619,16 +38291,11 @@ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " = 5", - "type": "tool_call" + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -49647,13 +38314,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "\n while i * i", - "type": "tool_call" + "text": " 100th prime number is 541", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -49672,13 +38334,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " <= n:\n if n", - "type": "tool_call" + "text": ".", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -49697,43 +38354,94 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " % i == 0 or", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "bxIams_G", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:13.404182+00:00", + "__module__": "datetime" + }, + "trace_id": "snO106yxStaL10ow", + "type": "metric", + "unit": "tokens", + "value": 252 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "bxIams_G", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:13.404224+00:00", + "__module__": "datetime" + }, + "trace_id": "snO106yxStaL10ow", + "type": "metric", + "unit": "tokens", + "value": 20 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "bxIams_G", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:13.404230+00:00", + "__module__": "datetime" + }, + "trace_id": "snO106yxStaL10ow", + "type": "metric", + "unit": "tokens", + "value": 272 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Write code and execute it to find the answer for: What is the 100th prime number?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " n % (i + 2", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -49750,9 +38458,9 @@ "parse_status": { "__enum__": "ToolCallParseStatus", "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" + "value": "started" }, - "tool_call": ") == 0:\n return False", + "tool_call": "", "type": "tool_call" }, "event_type": { @@ -49777,7 +38485,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "\n i +=", + "tool_call": "def is_prime(n):\n if n <= 1:\n return False", "type": "tool_call" }, "event_type": { @@ -49802,7 +38510,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " 6\n return True", + "tool_call": "\n if n <= 3:\n return True", "type": "tool_call" }, "event_type": { @@ -49827,7 +38535,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "\n\ndef nth_prime(n):\n", + "tool_call": "\n if n % 2 == 0 or n % 3", "type": "tool_call" }, "event_type": { @@ -49852,7 +38560,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " count = 0\n", + "tool_call": " == 0:\n return False\n i = 5\n ", "type": "tool_call" }, "event_type": { @@ -49877,7 +38585,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " num = 2\n", + "tool_call": " while i * i <= n:\n if n", "type": "tool_call" }, "event_type": { @@ -49902,7 +38610,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " while True:\n if", + "tool_call": " % i == 0 or n % (i", "type": "tool_call" }, "event_type": { @@ -49927,7 +38635,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " is_prime(num):\n", + "tool_call": " + 2) == 0:\n return False\n i +=", "type": "tool_call" }, "event_type": { @@ -49952,7 +38660,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " count += 1\n ", + "tool_call": " 6\n return True\n\ndef nth_prime(n):\n count =", "type": "tool_call" }, "event_type": { @@ -49977,7 +38685,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " if count == n:\n ", + "tool_call": " 0\n num = 2\n while True:\n if", "type": "tool_call" }, "event_type": { @@ -50002,7 +38710,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " return num\n num +=", + "tool_call": " is_prime(num):\n count += 1\n if count == n", "type": "tool_call" }, "event_type": { @@ -50027,7 +38735,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " 1\n\nprint(nth", + "tool_call": ":\n return num\n num += 1\n\nprint(nth_prime", "type": "tool_call" }, "event_type": { @@ -50052,7 +38760,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "_prime(100))", + "tool_call": "(100))", "type": "tool_call" }, "event_type": { @@ -50081,7 +38789,7 @@ "arguments": { "code": "def is_prime(n):\n if n <= 1:\n return False\n if n <= 3:\n return True\n if n % 2 == 0 or n % 3 == 0:\n return False\n i = 5\n while i * i <= n:\n if n % i == 0 or n % (i + 2) == 0:\n return False\n i += 6\n return True\n\ndef nth_prime(n):\n count = 0\n num = 2\n while True:\n if is_prime(num):\n count += 1\n if count == n:\n return num\n num += 1\n\nprint(nth_prime(100))" }, - "call_id": "9859b184-8882-4553-8e81-97c304a4fa9b", + "call_id": "e1110bc1-dc83-480d-ad33-09d49f5ccc8d", "tool_name": { "__enum__": "BuiltinTool", "__module__": "llama_stack.models.llama.datatypes", @@ -50133,13 +38841,13 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "coI936YN", + "span_id": "5J3hM-La", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:16:07.172040+00:00", + "__datetime__": "2025-03-07T01:44:09.121100+00:00", "__module__": "datetime" }, - "trace_id": "eUBw_VOpS32wNAGH", + "trace_id": "snO106yxStaL10ow", "type": "metric", "unit": "tokens", "value": 40 @@ -50150,13 +38858,13 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "coI936YN", + "span_id": "5J3hM-La", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:16:07.172411+00:00", + "__datetime__": "2025-03-07T01:44:09.121127+00:00", "__module__": "datetime" }, - "trace_id": "eUBw_VOpS32wNAGH", + "trace_id": "snO106yxStaL10ow", "type": "metric", "unit": "tokens", "value": 10 @@ -50167,13 +38875,13 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "coI936YN", + "span_id": "5J3hM-La", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:16:07.172422+00:00", + "__datetime__": "2025-03-07T01:44:09.121132+00:00", "__module__": "datetime" }, - "trace_id": "eUBw_VOpS32wNAGH", + "trace_id": "snO106yxStaL10ow", "type": "metric", "unit": "tokens", "value": 50 @@ -50232,47 +38940,7 @@ "data": { "event": { "delta": { - "text": "plexity the company", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " was founded in 2022", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": ".", + "text": "plexity the company was founded in 2022.", "type": "text" }, "event_type": { @@ -50314,13 +38982,13 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "85ppLY3v", + "span_id": "6jxCq3gU", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:18:01.989283+00:00", + "__datetime__": "2025-03-07T01:45:50.430436+00:00", "__module__": "datetime" }, - "trace_id": "he4nc6x1QZ6pWLtN", + "trace_id": "XhZWljYTTDCYF7vI", "type": "metric", "unit": "tokens", "value": 68 @@ -50331,13 +38999,13 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "85ppLY3v", + "span_id": "6jxCq3gU", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:18:01.989312+00:00", + "__datetime__": "2025-03-07T01:45:50.430477+00:00", "__module__": "datetime" }, - "trace_id": "he4nc6x1QZ6pWLtN", + "trace_id": "XhZWljYTTDCYF7vI", "type": "metric", "unit": "tokens", "value": 22 @@ -50348,13 +39016,13 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "85ppLY3v", + "span_id": "6jxCq3gU", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:18:01.989316+00:00", + "__datetime__": "2025-03-07T01:45:50.430489+00:00", "__module__": "datetime" }, - "trace_id": "he4nc6x1QZ6pWLtN", + "trace_id": "XhZWljYTTDCYF7vI", "type": "metric", "unit": "tokens", "value": 90 @@ -50413,27 +39081,7 @@ "data": { "event": { "delta": { - "text": "nowledge_search(query=\"", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "Perplexity the company", + "text": "nowledge_search(query=\"Perplexity the company", "type": "text" }, "event_type": { @@ -50482,7 +39130,7 @@ "arguments": { "query": "Perplexity the company founding date" }, - "call_id": "5845398b-1978-4138-9e72-800b65cf8fe7", + "call_id": "199ef050-bc11-4e4b-935d-f5241c3f40ef", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -50530,13 +39178,13 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "V20HYcJc", + "span_id": "m4wMGuSN", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:18:01.118353+00:00", + "__datetime__": "2025-03-07T01:45:49.880525+00:00", "__module__": "datetime" }, - "trace_id": "he4nc6x1QZ6pWLtN", + "trace_id": "XhZWljYTTDCYF7vI", "type": "metric", "unit": "tokens", "value": 29 @@ -50547,13 +39195,13 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "V20HYcJc", + "span_id": "m4wMGuSN", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:18:01.118394+00:00", + "__datetime__": "2025-03-07T01:45:49.880576+00:00", "__module__": "datetime" }, - "trace_id": "he4nc6x1QZ6pWLtN", + "trace_id": "XhZWljYTTDCYF7vI", "type": "metric", "unit": "tokens", "value": 23 @@ -50564,13 +39212,13 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "V20HYcJc", + "span_id": "m4wMGuSN", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:18:01.118408+00:00", + "__datetime__": "2025-03-07T01:45:49.880585+00:00", "__module__": "datetime" }, - "trace_id": "he4nc6x1QZ6pWLtN", + "trace_id": "XhZWljYTTDCYF7vI", "type": "metric", "unit": "tokens", "value": 52 @@ -50629,87 +39277,7 @@ "data": { "event": { "delta": { - "text": " NBA was created on August ", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "3, 1949,", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " with the merger of", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " the Basketball Association of America (", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "BAA) and the National", + "text": " NBA was created on August 3, 1949, with", "type": "text" }, "event_type": { @@ -50729,7 +39297,7 @@ "data": { "event": { "delta": { - "text": " Basketball League (N", + "text": " the merger of the Basketball Association of America (BAA) and the National", "type": "text" }, "event_type": { @@ -50749,7 +39317,7 @@ "data": { "event": { "delta": { - "text": "BL).", + "text": " Basketball League (NBL).", "type": "text" }, "event_type": { @@ -50791,13 +39359,13 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "2mRpWtE_", + "span_id": "OyfVMRgR", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:18:08.397090+00:00", + "__datetime__": "2025-03-07T01:45:53.322420+00:00", "__module__": "datetime" }, - "trace_id": "Xa0aO1SdQOiEqarh", + "trace_id": "TMrhR55CR-KrmGp0", "type": "metric", "unit": "tokens", "value": 63 @@ -50808,13 +39376,13 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "2mRpWtE_", + "span_id": "OyfVMRgR", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:18:08.397117+00:00", + "__datetime__": "2025-03-07T01:45:53.322482+00:00", "__module__": "datetime" }, - "trace_id": "Xa0aO1SdQOiEqarh", + "trace_id": "TMrhR55CR-KrmGp0", "type": "metric", "unit": "tokens", "value": 45 @@ -50825,13 +39393,13 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "2mRpWtE_", + "span_id": "OyfVMRgR", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:18:08.397124+00:00", + "__datetime__": "2025-03-07T01:45:53.322490+00:00", "__module__": "datetime" }, - "trace_id": "Xa0aO1SdQOiEqarh", + "trace_id": "TMrhR55CR-KrmGp0", "type": "metric", "unit": "tokens", "value": 108 @@ -50890,27 +39458,7 @@ "data": { "event": { "delta": { - "text": "nowledge_search(query=\"", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "NBA creation date\")]", + "text": "nowledge_search(query=\"NBA creation date\")]", "type": "text" }, "event_type": { @@ -50939,7 +39487,7 @@ "arguments": { "query": "NBA creation date" }, - "call_id": "5855dd61-6243-4922-a110-f072de222c69", + "call_id": "388e55ab-448a-4a98-905b-196c051bdeea", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -50987,13 +39535,13 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "Rn6uhM71", + "span_id": "QpFMmy3B", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:18:03.628231+00:00", + "__datetime__": "2025-03-07T01:45:52.235138+00:00", "__module__": "datetime" }, - "trace_id": "Xa0aO1SdQOiEqarh", + "trace_id": "TMrhR55CR-KrmGp0", "type": "metric", "unit": "tokens", "value": 27 @@ -51004,13 +39552,13 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "Rn6uhM71", + "span_id": "QpFMmy3B", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:18:03.628288+00:00", + "__datetime__": "2025-03-07T01:45:52.235160+00:00", "__module__": "datetime" }, - "trace_id": "Xa0aO1SdQOiEqarh", + "trace_id": "TMrhR55CR-KrmGp0", "type": "metric", "unit": "tokens", "value": 20 @@ -51021,13 +39569,13 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "Rn6uhM71", + "span_id": "QpFMmy3B", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-12T23:18:03.628296+00:00", + "__datetime__": "2025-03-07T01:45:52.235165+00:00", "__module__": "datetime" }, - "trace_id": "Xa0aO1SdQOiEqarh", + "trace_id": "TMrhR55CR-KrmGp0", "type": "metric", "unit": "tokens", "value": 47 diff --git a/tests/integration/fixtures/recorded_responses/invoke_tool.json b/tests/integration/fixtures/recorded_responses/invoke_tool.json index ab9793303f..76191e992f 100644 --- a/tests/integration/fixtures/recorded_responses/invoke_tool.json +++ b/tests/integration/fixtures/recorded_responses/invoke_tool.json @@ -64,19 +64,6 @@ } } }, - "[[], {\"kwargs\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics of the dataframe\\nprint(df.describe())\", \"session_id\": \"\"}, \"tool_name\": \"code_interpreter\"}]": { - "type": "value", - "value": { - "__module__": "llama_stack.apis.tools.tools", - "__pydantic__": "ToolInvocationResult", - "data": { - "content": "error\n[stdout]\n[Errno 2] No such file or directory: 'bwrap'\n[/stdout]\n[stderr]\n[Errno 2] No such file or directory: 'bwrap'\n[/stderr]", - "error_code": null, - "error_message": null, - "metadata": null - } - } - }, "[[], {\"kwargs\": {\"code\": \"import pandas as pd\\ndf = pd.read_csv(\\\"\")\\nprint(df.head())\", \"session_id\": \"\"}, \"tool_name\": \"code_interpreter\"}]": { "type": "value", "value": { @@ -387,23 +374,23 @@ "type": "text" }, { - "text": "Result 1:\nDocument_id:7f524\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\nlook like so:\n\n.. code-block:: python\n\n from torchtune.datasets import chat_dataset\n from torchtune.models.llama3 import llama3_tokenizer\n\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\n ds = chat_dataset(\n tokenizer=tokenizer,\n source=\"json\",\n data_files=\"data/my_data.json\",\n split=\"train\",\n conversation_column=\"dialogue\",\n conversation_style=\"sharegpt\",\n )\n\n.. code-block:: yaml\n\n # In config\n tokenizer:\n _component_: torchtune.models.llama3.llama3_tokenizer\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\n\n dataset:\n _component_: torchtune.datasets.chat_dataset\n source: json\n data_files: data/my_data.json\n split: train\n conversation_column: dialogue\n conversation_style: sharegpt\n\n.. note::\n You can pass in any keyword argument for `load_dataset `_ into all our\n Dataset classes and they will honor them. This is useful for common parameters\n such as specifying the data split with :code:`split` or configuration with\n :code:`name`\n\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\nall messages according to their `recommendations `_ into all our\n Dataset classes and they will honor them. This is useful for common parameters\n such as specifying the data split with :code:`split` or configuration with\n :code:`name`\n\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW `.\n.. .. _glossary_fsdp2:\n\n", + "text": "Result 3:\nDocument_id:b49f7\nContent: ` module, which we swap\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP `.\n.. .. _glossary_fsdp2:\n\n", "type": "text" }, { - "text": "Result 4:\nDocument_id:4bcdb\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", + "text": "Result 4:\nDocument_id:961ff\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", "type": "text" }, { - "text": "Result 5:\nDocument_id:c553d\nContent: etune\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.use_dora=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n use_dora: True\n\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\neven more memory savings!\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.apply_lora_to_mlp=True \\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\n model.lora_rank=16 \\\n model.lora_alpha=32 \\\n model.use_dora=True \\\n model.quantize_base=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n apply_lora_to_mlp: True\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\n lora_rank: 16\n lora_alpha: 32\n use_dora: True\n quantize_base: True\n\n\n.. note::\n\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP `.\n.. .. _glossary_fsdp2:\n\n", + "text": "Result 5:\nDocument_id:b49f7\nContent: etune\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.use_dora=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n use_dora: True\n\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\neven more memory savings!\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.apply_lora_to_mlp=True \\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\n model.lora_rank=16 \\\n model.lora_alpha=32 \\\n model.use_dora=True \\\n model.quantize_base=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n apply_lora_to_mlp: True\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\n lora_rank: 16\n lora_alpha: 32\n use_dora: True\n quantize_base: True\n\n\n.. note::\n\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP `.\n.. .. _glossary_fsdp2:\n\n", "type": "text" }, { @@ -415,11 +402,11 @@ "error_message": null, "metadata": { "document_ids": [ - "7f5245b0-58f9-44bb-8047-1c5e6c943496", - "4bcdb054-da63-48cb-b9c6-c38397569929", - "c553db3b-8eee-4027-b5b5-696fa7b6505e", - "4bcdb054-da63-48cb-b9c6-c38397569929", - "c553db3b-8eee-4027-b5b5-696fa7b6505e" + "24443dfb-a0b3-4ce8-820e-3fb1f12364bb", + "961ff2d1-8887-41ef-a4fe-fa4cbab7b932", + "b49f7985-6615-4dcf-99be-d1765b6a6fc6", + "961ff2d1-8887-41ef-a4fe-fa4cbab7b932", + "b49f7985-6615-4dcf-99be-d1765b6a6fc6" ] } } @@ -431,7 +418,7 @@ "__module__": "llama_stack.apis.tools.tools", "__pydantic__": "ToolInvocationResult", "data": { - "content": "{\"query\": \"current CEO of Meta\", \"top_k\": [{\"title\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\", \"url\": \"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\", \"content\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\u00a9 2025 Meta\", \"score\": 0.79099923, \"raw_content\": null}, {\"title\": \"Meet the Executive CSuite Team of Meta (Facebook) [2025]\", \"url\": \"https://digitaldefynd.com/IQ/meet-the-executive-csuite-team-of-meta-facebook/\", \"content\": \"Harvard University Executive Programs Free Harvard University Courses As a chief financial officer of Meta, Susan Li oversees the firm\\u2019s finance and facilities team to keep track of the company\\u2019s overall financial health. The chief operating officer of Meta, Javier Olivan, oversees the firm\\u2019s business team, infrastructure, and other products. Andrew Bosworth, called Boz, serves as chief technology officer at Meta and is responsible for leading the firm\\u2019s AR/VR organization, Reality Labs. Andrew has also served as engineering director to oversee events, mobile monetization, and feed ads and as VP of ads and business platforms to lead engineering, design, analytics, and product teams. Meta\\u2019s c-suite team comprises experienced and diverse executives, having extensive experience in technology, finance, legal, and all major industries.\", \"score\": 0.7602419, \"raw_content\": null}, {\"title\": \"Meta - Leadership & Governance\", \"url\": \"https://investor.atmeta.com/leadership-and-governance/\", \"content\": \"Mr. Andreessen was a co-founder of Netscape Communications Corporation, a software company, serving in various positions, including Chief Technology Officer and Executive Vice President of Products. Ms. Killefer also served as Assistant Secretary for Management, Chief Financial Officer, and Chief Operating Officer of the U.S. Department of the Treasury from 1997 to 2000 and as a member of the IRS Oversight Board from 2000 to 2005, including as Chair of the IRS Oversight Board from 2002 to 2004. Ms. Travis has served as Executive Vice President and Chief Financial Officer of The Estee Lauder Companies Inc., a global manufacturer and marketer of skin care, makeup, fragrance and hair care products, since August 2012.\", \"score\": 0.6175132, \"raw_content\": null}, {\"title\": \"Executives - Meta\", \"url\": \"https://about.meta.com/media-gallery/executives/\", \"content\": \"Meta leadership: images of senior executives for download to use in articles about the company.\", \"score\": 0.21026355, \"raw_content\": null}, {\"title\": \"Mark Zuckerberg - Wikipedia\", \"url\": \"https://en.wikipedia.org/wiki/Mark_Zuckerberg\", \"content\": \"They began dating in 2003.[175] In September 2010, Chan, who was a medical student at the University of California, San Francisco at the time,[176] moved into his rented house in Palo Alto, California.[177][178] They married on May 19, 2012, in the grounds of his mansion in an event that also celebrated her graduation from medical school.[179][180] Zuckerberg revealed in July 2015 that they were expecting a baby girl and that Chan had previously experienced three miscarriages.[181] Their first daughter was born in December 2015.[182] They announced in a Chinese New Year video that their daughter's Chinese name is Chen Mingyu (Chinese: \\u9648\\u660e\\u5b87).[183] Their second daughter was born in August 2017.[184] Zuckerberg and his wife welcomed their third daughter in March 2023 and announced the news across his social media pages.[185] The couple also have a Puli dog named Beast,[186] who has over two million followers on Facebook.[187] Zuckerberg commissioned the visual artist Daniel Arsham to build a 7-foot-tall sculpture of his wife, which was unveiled in 2024.[188]\", \"score\": 0.05564338, \"raw_content\": null}]}", + "content": "{\"query\": \"current CEO of Meta\", \"top_k\": [{\"title\": \"Meta - Leadership & Governance\", \"url\": \"https://investor.atmeta.com/leadership-and-governance/\", \"content\": \"Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. Mark is responsible for setting the overall direction and product strategy for the company. He leads the design of Meta's services and development of its core technology and infrastructure. Mark studied computer science at Harvard\", \"score\": 0.8342047, \"raw_content\": null}, {\"title\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\", \"url\": \"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\", \"content\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\u00a9 2025 Meta\", \"score\": 0.79099923, \"raw_content\": null}, {\"title\": \"The 11 People Running Meta's $1 Trillion Social Media and ... - Observer\", \"url\": \"https://observer.com/2024/01/meta-facebook-top-executives/\", \"content\": \"Meta has one of the most stable leadership team in the tech industry. Almost all of Meta's top executives have been with the company for well over a decade. ... 39, cofounder, chairman and CEO\", \"score\": 0.45536873, \"raw_content\": null}, {\"title\": \"Executives - Meta\", \"url\": \"https://about.meta.com/media-gallery/executives/\", \"content\": \"Meta leadership: images of senior executives for download to use in articles about the company.\", \"score\": 0.21026355, \"raw_content\": null}, {\"title\": \"Mark Zuckerberg - Wikipedia\", \"url\": \"https://en.wikipedia.org/wiki/Mark_Zuckerberg\", \"content\": \"They began dating in 2003.[175] In September 2010, Chan, who was a medical student at the University of California, San Francisco at the time,[176] moved into his rented house in Palo Alto, California.[177][178] They married on May 19, 2012, in the grounds of his mansion in an event that also celebrated her graduation from medical school.[179][180] Zuckerberg revealed in July 2015 that they were expecting a baby girl and that Chan had previously experienced three miscarriages.[181] Their first daughter was born in December 2015.[182] They announced in a Chinese New Year video that their daughter's Chinese name is Chen Mingyu (Chinese: \\u9648\\u660e\\u5b87).[183] Their second daughter was born in August 2017.[184] Zuckerberg and his wife welcomed their third daughter in March 2023 and announced the news across his social media pages.[185] The couple also have a Puli dog named Beast,[186] who has over two million followers on Facebook.[187] Zuckerberg commissioned the visual artist Daniel Arsham to build a 7-foot-tall sculpture of his wife, which was unveiled in 2024.[188]\", \"score\": 0.05564338, \"raw_content\": null}]}", "error_code": null, "error_message": null, "metadata": null @@ -450,23 +437,23 @@ "type": "text" }, { - "text": "Result 1:\nDocument_id:4bcdb\nContent: .. _lora_finetune_label:\n\n============================\nFine-Tuning Llama2 with LoRA\n============================\n\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\n See :ref:`below` for how to do this.\n\nLet's inspect each of these models a bit more closely.\n\n.. code-block:: bash\n\n # Print the first layer's self-attention in the usual Llama2 model\n >>> print(base_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (pos_embeddings): RotaryPositionalEmbeddings()\n )\n\n # Print the same for Llama2 with LoRA weights\n >>> print(lora_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): LoRALinear(\n (dropout): Dropout(p=0.0, inplace=False)\n \n", + "text": "Result 2:\nDocument_id:20e5d\nContent: LoRA to Llama2 models\n------------------------------\n\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\n\n.. code-block:: python\n\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\n\n # Build Llama2 without any LoRA layers\n base_model = llama2_7b()\n\n # The default settings for lora_llama2_7b will match those for llama2_7b\n # We just need to define which layers we want LoRA applied to.\n # Within each self-attention, we can choose from [\"q_proj\", \"k_proj\", \"v_proj\", and \"output_proj\"].\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\n # layers outside of the self-attention.\n lora_model = lora_llama2_7b(lora_attn_modules=[\"q_proj\", \"v_proj\"])\n\n.. note::\n\n Calling :func:`lora_llama_2_7b ` alone will not handle the definition of which parameters are trainable.\n See :ref:`below` for how to do this.\n\nLet's inspect each of these models a bit more closely.\n\n.. code-block:: bash\n\n # Print the first layer's self-attention in the usual Llama2 model\n >>> print(base_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (pos_embeddings): RotaryPositionalEmbeddings()\n )\n\n # Print the same for Llama2 with LoRA weights\n >>> print(lora_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): LoRALinear(\n (dropout): Dropout(p=0.0, inplace=False)\n \n", "type": "text" }, { - "text": "Result 3:\nDocument_id:4bcdb\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", + "text": "Result 3:\nDocument_id:20e5d\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", "type": "text" }, { - "text": "Result 4:\nDocument_id:4bcdb\nContent: from our Llama2\nmodel without any wrappers or custom checkpoint conversion logic.\n\n.. code-block:: python\n\n # Assuming that base_model already has the pretrained Llama2 weights,\n # this will directly load them into your LoRA model without any conversion necessary.\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\n\n.. note::\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\n :func:`validate_missing_and_unexpected_for_lora() `.\n\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\n\n.. _setting_trainable_params:\n\n.. code-block:: python\n\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\n\n # Fetch all params from the model that are associated with LoRA.\n lora_params = get_adapter_params(lora_model)\n\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\n set_trainable_params(lora_model, lora_params)\n\n # Print the total number of parameters\n total_params = sum([p.numel() for p in lora_model.parameters()])\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\n print(\n f\"\"\"\n {total_params} total params,\n {trainable_params}\" trainable params,\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\n \"\"\"\n )\n\n 6742609920 total params,\n 4194304 trainable params,\n 0.06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `.\n\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\n\n.. _setting_trainable_params:\n\n.. code-block:: python\n\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\n\n # Fetch all params from the model that are associated with LoRA.\n lora_params = get_adapter_params(lora_model)\n\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\n set_trainable_params(lora_model, lora_params)\n\n # Print the total number of parameters\n total_params = sum([p.numel() for p in lora_model.parameters()])\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\n print(\n f\"\"\"\n {total_params} total params,\n {trainable_params}\" trainable params,\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\n \"\"\"\n )\n\n 6742609920 total params,\n 4194304 trainable params,\n 0.06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe Date: Wed, 12 Mar 2025 16:47:22 -0700 Subject: [PATCH 05/14] datasetio --- tests/integration/datasetio/test_datasetio.py | 17 +++++++++-- tests/integration/scoring/test_scoring.py | 28 +++++++++---------- 2 files changed, 28 insertions(+), 17 deletions(-) diff --git a/tests/integration/datasetio/test_datasetio.py b/tests/integration/datasetio/test_datasetio.py index 5b1d1a37a0..6dddf59157 100644 --- a/tests/integration/datasetio/test_datasetio.py +++ b/tests/integration/datasetio/test_datasetio.py @@ -9,10 +9,23 @@ import os from pathlib import Path +import pytest + # How to run this test: # # LLAMA_STACK_CONFIG="template-name" pytest -v tests/integration/datasetio +@pytest.fixture +def test_dataset(llama_stack_client): + register_dataset(llama_stack_client) + yield # This is where the test function will run + + # Teardown - this always runs, even if the test fails + try: + llama_stack_client.datasets.unregister("test_dataset") + except Exception as e: + print(f"Warning: Failed to unregister test_dataset: {e}") + def data_url_from_file(file_path: str) -> str: if not os.path.exists(file_path): @@ -80,8 +93,7 @@ def test_register_unregister_dataset(llama_stack_client): assert len(response) == 0 -def test_get_rows_paginated(llama_stack_client): - register_dataset(llama_stack_client) +def test_get_rows_paginated(llama_stack_client, test_dataset): response = llama_stack_client.datasetio.get_rows_paginated( dataset_id="test_dataset", rows_in_page=3, @@ -99,4 +111,3 @@ def test_get_rows_paginated(llama_stack_client): assert isinstance(response.rows, list) assert len(response.rows) == 2 assert response.next_page_token == "5" - llama_stack_client.datasets.unregister("test_dataset") diff --git a/tests/integration/scoring/test_scoring.py b/tests/integration/scoring/test_scoring.py index 6c2623705e..3477516a22 100644 --- a/tests/integration/scoring/test_scoring.py +++ b/tests/integration/scoring/test_scoring.py @@ -9,6 +9,17 @@ from ..datasetio.test_datasetio import register_dataset +@pytest.fixture +def test_dataset_rag(llama_stack_client): + register_dataset(llama_stack_client, for_rag=True) + yield # This is where the test function will run + + # Teardown - this always runs, even if the test fails + try: + llama_stack_client.datasets.unregister("test_dataset") + except Exception as e: + print(f"Warning: Failed to unregister test_dataset: {e}") + @pytest.fixture def sample_judge_prompt_template(): @@ -79,9 +90,7 @@ def test_scoring_functions_register( # TODO: add unregister api for scoring functions -def test_scoring_score(llama_stack_client): - register_dataset(llama_stack_client, for_rag=True) - +def test_scoring_score(llama_stack_client, test_dataset_rag): # scoring individual rows rows = llama_stack_client.datasetio.get_rows_paginated( dataset_id="test_dataset", @@ -114,12 +123,8 @@ def test_scoring_score(llama_stack_client): assert x in response.results assert len(response.results[x].score_rows) == 5 - llama_stack_client.datasets.unregister("test_dataset") - - -def test_scoring_score_with_params_llm_as_judge(llama_stack_client, sample_judge_prompt_template, judge_model_id): - register_dataset(llama_stack_client, for_rag=True) +def test_scoring_score_with_params_llm_as_judge(llama_stack_client, sample_judge_prompt_template, judge_model_id, test_dataset_rag): # scoring individual rows rows = llama_stack_client.datasetio.get_rows_paginated( dataset_id="test_dataset", @@ -159,8 +164,6 @@ def test_scoring_score_with_params_llm_as_judge(llama_stack_client, sample_judge assert x in response.results assert len(response.results[x].score_rows) == 5 - llama_stack_client.datasets.unregister("test_dataset") - @pytest.mark.parametrize( "provider_id", @@ -171,9 +174,8 @@ def test_scoring_score_with_params_llm_as_judge(llama_stack_client, sample_judge ], ) def test_scoring_score_with_aggregation_functions( - llama_stack_client, sample_judge_prompt_template, judge_model_id, provider_id + llama_stack_client, sample_judge_prompt_template, judge_model_id, provider_id, test_dataset_rag ): - register_dataset(llama_stack_client, for_rag=True) rows = llama_stack_client.datasetio.get_rows_paginated( dataset_id="test_dataset", rows_in_page=3, @@ -227,5 +229,3 @@ def test_scoring_score_with_aggregation_functions( assert x in response.results assert len(response.results[x].score_rows) == len(rows.rows) assert len(response.results[x].aggregated_results) == len(aggr_fns) - - llama_stack_client.datasets.unregister("test_dataset") From 9dcf617556acf1f8e7b15a6a5f2d78ba9dd0d84e Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Wed, 12 Mar 2025 16:47:40 -0700 Subject: [PATCH 06/14] datasetio --- tests/integration/datasetio/test_datasetio.py | 3 ++- tests/integration/scoring/test_scoring.py | 7 +++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/integration/datasetio/test_datasetio.py b/tests/integration/datasetio/test_datasetio.py index 6dddf59157..64e187fb5d 100644 --- a/tests/integration/datasetio/test_datasetio.py +++ b/tests/integration/datasetio/test_datasetio.py @@ -15,11 +15,12 @@ # # LLAMA_STACK_CONFIG="template-name" pytest -v tests/integration/datasetio + @pytest.fixture def test_dataset(llama_stack_client): register_dataset(llama_stack_client) yield # This is where the test function will run - + # Teardown - this always runs, even if the test fails try: llama_stack_client.datasets.unregister("test_dataset") diff --git a/tests/integration/scoring/test_scoring.py b/tests/integration/scoring/test_scoring.py index 3477516a22..a2d8500d58 100644 --- a/tests/integration/scoring/test_scoring.py +++ b/tests/integration/scoring/test_scoring.py @@ -9,11 +9,12 @@ from ..datasetio.test_datasetio import register_dataset + @pytest.fixture def test_dataset_rag(llama_stack_client): register_dataset(llama_stack_client, for_rag=True) yield # This is where the test function will run - + # Teardown - this always runs, even if the test fails try: llama_stack_client.datasets.unregister("test_dataset") @@ -124,7 +125,9 @@ def test_scoring_score(llama_stack_client, test_dataset_rag): assert len(response.results[x].score_rows) == 5 -def test_scoring_score_with_params_llm_as_judge(llama_stack_client, sample_judge_prompt_template, judge_model_id, test_dataset_rag): +def test_scoring_score_with_params_llm_as_judge( + llama_stack_client, sample_judge_prompt_template, judge_model_id, test_dataset_rag +): # scoring individual rows rows = llama_stack_client.datasetio.get_rows_paginated( dataset_id="test_dataset", From 4948cda0da927702f41d7a6fe99df47e515872ea Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Wed, 12 Mar 2025 16:59:02 -0700 Subject: [PATCH 07/14] regen --- .../recorded_responses/chat_completion.json | 31931 ++++++++++++++-- .../recorded_responses/invoke_tool.json | 81 +- 2 files changed, 28144 insertions(+), 3868 deletions(-) diff --git a/tests/integration/fixtures/recorded_responses/chat_completion.json b/tests/integration/fixtures/recorded_responses/chat_completion.json index 7234b6c31d..71fbc9361b 100644 --- a/tests/integration/fixtures/recorded_responses/chat_completion.json +++ b/tests/integration/fixtures/recorded_responses/chat_completion.json @@ -26786,7 +26786,7 @@ "data": { "event": { "delta": { - "text": " provided function definitions", + "text": " provided function definitions are", "type": "text" }, "event_type": { @@ -26806,7 +26806,7 @@ "data": { "event": { "delta": { - "text": " are not suitable", + "text": " not suitable", "type": "text" }, "event_type": { @@ -26826,7 +26826,7 @@ "data": { "event": { "delta": { - "text": " for this task. Please re", + "text": " for this", "type": "text" }, "event_type": { @@ -26846,7 +26846,7 @@ "data": { "event": { "delta": { - "text": "work them to", + "text": " task", "type": "text" }, "event_type": { @@ -26866,7 +26866,7 @@ "data": { "event": { "delta": { - "text": " align with the task requirements.", + "text": ". Please", "type": "text" }, "event_type": { @@ -26886,94 +26886,13 @@ "data": { "event": { "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "D2n_IS_8", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:03:32.021393+00:00", - "__module__": "datetime" - }, - "trace_id": "amAiZv5PQKSsA74j", - "type": "metric", - "unit": "tokens", - "value": 90 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "D2n_IS_8", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:03:32.021420+00:00", - "__module__": "datetime" - }, - "trace_id": "amAiZv5PQKSsA74j", - "type": "metric", - "unit": "tokens", - "value": 32 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "D2n_IS_8", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:03:32.021427+00:00", - "__module__": "datetime" - }, - "trace_id": "amAiZv5PQKSsA74j", - "type": "metric", - "unit": "tokens", - "value": 122 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. \", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", + "text": " rework", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -26987,7 +26906,7 @@ "data": { "event": { "delta": { - "text": "[", + "text": " them to", "type": "text" }, "event_type": { @@ -27007,7 +26926,7 @@ "data": { "event": { "delta": { - "text": "get_boiling_point(liquid_name=\"polyjuice\", celcius", + "text": " align with", "type": "text" }, "event_type": { @@ -27027,7 +26946,7 @@ "data": { "event": { "delta": { - "text": "=True)]", + "text": " the task", "type": "text" }, "event_type": { @@ -27047,20 +26966,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "celcius": true, - "liquid_name": "polyjuice" - }, - "call_id": "fc83cd58-3cfb-431d-a1e2-a8572d682e2f", - "tool_name": "get_boiling_point" - }, - "type": "tool_call" + "text": " requirements.", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -27068,11 +26975,7 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } @@ -27100,55 +27003,19 @@ }, "metrics": [ { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "prompt_tokens", - "span_id": "YhFB39Ik", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:31.335148+00:00", - "__module__": "datetime" - }, - "trace_id": "3n2xEtjLQt6ZGVR_", - "type": "metric", - "unit": "tokens", - "value": 267 + "unit": null, + "value": 90 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "completion_tokens", - "span_id": "YhFB39Ik", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:31.335179+00:00", - "__module__": "datetime" - }, - "trace_id": "3n2xEtjLQt6ZGVR_", - "type": "metric", - "unit": "tokens", - "value": 28 + "unit": null, + "value": 32 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "total_tokens", - "span_id": "YhFB39Ik", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:31.335185+00:00", - "__module__": "datetime" - }, - "trace_id": "3n2xEtjLQt6ZGVR_", - "type": "metric", - "unit": "tokens", - "value": 295 + "unit": null, + "value": 122 } ] } @@ -27156,7 +27023,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. \", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. \", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -27254,7 +27121,7 @@ "celcius": true, "liquid_name": "polyjuice" }, - "call_id": "7d41a671-f3ce-46dd-b001-443aaa65ccb7", + "call_id": "fc83cd58-3cfb-431d-a1e2-a8572d682e2f", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -27302,16 +27169,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "lnqeV_cZ", + "span_id": "YhFB39Ik", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:29.708270+00:00", + "__datetime__": "2025-03-07T01:44:31.335148+00:00", "__module__": "datetime" }, - "trace_id": "me4qbUSCQ5yKvrAG", + "trace_id": "3n2xEtjLQt6ZGVR_", "type": "metric", "unit": "tokens", - "value": 211 + "value": 267 }, { "attributes": { @@ -27319,13 +27186,13 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "lnqeV_cZ", + "span_id": "YhFB39Ik", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:29.708281+00:00", + "__datetime__": "2025-03-07T01:44:31.335179+00:00", "__module__": "datetime" }, - "trace_id": "me4qbUSCQ5yKvrAG", + "trace_id": "3n2xEtjLQt6ZGVR_", "type": "metric", "unit": "tokens", "value": 28 @@ -27336,16 +27203,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "lnqeV_cZ", + "span_id": "YhFB39Ik", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:29.708284+00:00", + "__datetime__": "2025-03-07T01:44:31.335185+00:00", "__module__": "datetime" }, - "trace_id": "me4qbUSCQ5yKvrAG", + "trace_id": "3n2xEtjLQt6ZGVR_", "type": "metric", "unit": "tokens", - "value": 239 + "value": 295 } ] } @@ -27353,7 +27220,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. \", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. \", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -27451,7 +27318,7 @@ "celcius": true, "liquid_name": "polyjuice" }, - "call_id": "21c8e60f-d205-4b3d-b065-47fa56dcd273", + "call_id": "7d41a671-f3ce-46dd-b001-443aaa65ccb7", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -27499,16 +27366,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "TDJHPVDZ", + "span_id": "lnqeV_cZ", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:28.195776+00:00", + "__datetime__": "2025-03-07T01:44:29.708270+00:00", "__module__": "datetime" }, - "trace_id": "r2GKj8iqTYaNxTeq", + "trace_id": "me4qbUSCQ5yKvrAG", "type": "metric", "unit": "tokens", - "value": 155 + "value": 211 }, { "attributes": { @@ -27516,13 +27383,13 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "TDJHPVDZ", + "span_id": "lnqeV_cZ", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:28.195808+00:00", + "__datetime__": "2025-03-07T01:44:29.708281+00:00", "__module__": "datetime" }, - "trace_id": "r2GKj8iqTYaNxTeq", + "trace_id": "me4qbUSCQ5yKvrAG", "type": "metric", "unit": "tokens", "value": 28 @@ -27533,16 +27400,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "TDJHPVDZ", + "span_id": "lnqeV_cZ", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:28.195814+00:00", + "__datetime__": "2025-03-07T01:44:29.708284+00:00", "__module__": "datetime" }, - "trace_id": "r2GKj8iqTYaNxTeq", + "trace_id": "me4qbUSCQ5yKvrAG", "type": "metric", "unit": "tokens", - "value": 183 + "value": 239 } ] } @@ -27550,7 +27417,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. \", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. \", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -27648,7 +27515,7 @@ "celcius": true, "liquid_name": "polyjuice" }, - "call_id": "135d468e-6391-401d-a3c0-3b08c3a6eb8c", + "call_id": "21c8e60f-d205-4b3d-b065-47fa56dcd273", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -27696,16 +27563,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "8pZtsyNW", + "span_id": "TDJHPVDZ", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:47:51.321089+00:00", + "__datetime__": "2025-03-07T01:44:28.195776+00:00", "__module__": "datetime" }, - "trace_id": "1Ly70plQQGel5jgc", + "trace_id": "r2GKj8iqTYaNxTeq", "type": "metric", "unit": "tokens", - "value": 99 + "value": 155 }, { "attributes": { @@ -27713,13 +27580,13 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "8pZtsyNW", + "span_id": "TDJHPVDZ", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:47:51.321130+00:00", + "__datetime__": "2025-03-07T01:44:28.195808+00:00", "__module__": "datetime" }, - "trace_id": "1Ly70plQQGel5jgc", + "trace_id": "r2GKj8iqTYaNxTeq", "type": "metric", "unit": "tokens", "value": 28 @@ -27730,16 +27597,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "8pZtsyNW", + "span_id": "TDJHPVDZ", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:47:51.321140+00:00", + "__datetime__": "2025-03-07T01:44:28.195814+00:00", "__module__": "datetime" }, - "trace_id": "1Ly70plQQGel5jgc", + "trace_id": "r2GKj8iqTYaNxTeq", "type": "metric", "unit": "tokens", - "value": 127 + "value": 183 } ] } @@ -27747,7 +27614,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. \", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. \", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -27795,7 +27662,7 @@ "data": { "event": { "delta": { - "text": "get_boiling_point(liquid_name='polyjuice", + "text": "get_boiling_point(liquid_name=\"polyjuice\", celcius", "type": "text" }, "event_type": { @@ -27815,7 +27682,7 @@ "data": { "event": { "delta": { - "text": "', celcius=True)]", + "text": "=True)]", "type": "text" }, "event_type": { @@ -27845,7 +27712,7 @@ "celcius": true, "liquid_name": "polyjuice" }, - "call_id": "3955f756-9aa0-433f-be8f-af8941c220de", + "call_id": "135d468e-6391-401d-a3c0-3b08c3a6eb8c", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -27893,16 +27760,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "QZ6PSGpT", + "span_id": "8pZtsyNW", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T02:03:29.629456+00:00", + "__datetime__": "2025-03-07T01:47:51.321089+00:00", "__module__": "datetime" }, - "trace_id": "M72bosg8TBe3uhx3", + "trace_id": "1Ly70plQQGel5jgc", "type": "metric", "unit": "tokens", - "value": 43 + "value": 99 }, { "attributes": { @@ -27910,13 +27777,13 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "QZ6PSGpT", + "span_id": "8pZtsyNW", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T02:03:29.629488+00:00", + "__datetime__": "2025-03-07T01:47:51.321130+00:00", "__module__": "datetime" }, - "trace_id": "M72bosg8TBe3uhx3", + "trace_id": "1Ly70plQQGel5jgc", "type": "metric", "unit": "tokens", "value": 28 @@ -27927,16 +27794,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "QZ6PSGpT", + "span_id": "8pZtsyNW", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T02:03:29.629494+00:00", + "__datetime__": "2025-03-07T01:47:51.321140+00:00", "__module__": "datetime" }, - "trace_id": "M72bosg8TBe3uhx3", + "trace_id": "1Ly70plQQGel5jgc", "type": "metric", "unit": "tokens", - "value": 71 + "value": 127 } ] } @@ -27944,7 +27811,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. \", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -27972,7 +27839,7 @@ "data": { "event": { "delta": { - "text": "The", + "text": "[", "type": "text" }, "event_type": { @@ -27992,7 +27859,7 @@ "data": { "event": { "delta": { - "text": " function call returned an", + "text": "get_bo", "type": "text" }, "event_type": { @@ -28012,7 +27879,7 @@ "data": { "event": { "delta": { - "text": " error since", + "text": "iling_point", "type": "text" }, "event_type": { @@ -28032,7 +27899,7 @@ "data": { "event": { "delta": { - "text": " \"", + "text": "(liquid", "type": "text" }, "event_type": { @@ -28052,7 +27919,7 @@ "data": { "event": { "delta": { - "text": "polyjuice\" is", + "text": "_name='", "type": "text" }, "event_type": { @@ -28072,7 +27939,7 @@ "data": { "event": { "delta": { - "text": " not a real liquid. Polyju", + "text": "polyju", "type": "text" }, "event_type": { @@ -28092,7 +27959,7 @@ "data": { "event": { "delta": { - "text": "ice is a fictional substance from the", + "text": "ice',", "type": "text" }, "event_type": { @@ -28112,7 +27979,7 @@ "data": { "event": { "delta": { - "text": " Harry Potter series. The boiling point", + "text": " celci", "type": "text" }, "event_type": { @@ -28132,7 +27999,7 @@ "data": { "event": { "delta": { - "text": " of a substance is a physical", + "text": "us=True", "type": "text" }, "event_type": { @@ -28152,7 +28019,7 @@ "data": { "event": { "delta": { - "text": " property that can be measured and", + "text": ")]", "type": "text" }, "event_type": { @@ -28172,48 +28039,20 @@ "data": { "event": { "delta": { - "text": " quantified", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": ", but it only applies", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " to real substances that exist in the physical world.", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "celcius": true, + "liquid_name": "polyjuice" + }, + "call_id": "a1515d69-2f3f-43a3-9d1d-7b78b2c9fb5a", + "tool_name": "get_boiling_point" + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -28221,7 +28060,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -28249,55 +28092,19 @@ }, "metrics": [ { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "prompt_tokens", - "span_id": "y9SHtJTQ", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:05:01.411612+00:00", - "__module__": "datetime" - }, - "trace_id": "_I2Cu85IRtOSBSX9", - "type": "metric", - "unit": "tokens", - "value": 84 + "unit": null, + "value": 43 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "completion_tokens", - "span_id": "y9SHtJTQ", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:05:01.411644+00:00", - "__module__": "datetime" - }, - "trace_id": "_I2Cu85IRtOSBSX9", - "type": "metric", - "unit": "tokens", - "value": 73 + "unit": null, + "value": 28 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "total_tokens", - "span_id": "y9SHtJTQ", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:05:01.411650+00:00", - "__module__": "datetime" - }, - "trace_id": "_I2Cu85IRtOSBSX9", - "type": "metric", - "unit": "tokens", - "value": 157 + "unit": null, + "value": 71 } ] } @@ -28305,7 +28112,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -28353,7 +28160,7 @@ "data": { "event": { "delta": { - "text": " function get_boiling_point is not", + "text": " function get", "type": "text" }, "event_type": { @@ -28373,7 +28180,7 @@ "data": { "event": { "delta": { - "text": " recognized.", + "text": "_boiling", "type": "text" }, "event_type": { @@ -28393,94 +28200,33 @@ "data": { "event": { "delta": { - "text": "", + "text": "_point is", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "Z7jBGJ-8", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:55.401637+00:00", - "__module__": "datetime" - }, - "trace_id": "WxMAq579Q-ixJ3wJ", - "type": "metric", - "unit": "tokens", - "value": 93 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "Z7jBGJ-8", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:55.401666+00:00", - "__module__": "datetime" - }, - "trace_id": "WxMAq579Q-ixJ3wJ", - "type": "metric", - "unit": "tokens", - "value": 20 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "Z7jBGJ-8", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:55.401670+00:00", - "__module__": "datetime" - }, - "trace_id": "WxMAq579Q-ixJ3wJ", - "type": "metric", - "unit": "tokens", - "value": 113 - } - ] + "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point_with_metadata\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point_with_metadata\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", + "text": " not able", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -28494,7 +28240,7 @@ "data": { "event": { "delta": { - "text": "The", + "text": " to determine", "type": "text" }, "event_type": { @@ -28514,7 +28260,7 @@ "data": { "event": { "delta": { - "text": " function get_bo", + "text": " the boiling", "type": "text" }, "event_type": { @@ -28534,7 +28280,7 @@ "data": { "event": { "delta": { - "text": "iling_point_with_metadata does not exist,", + "text": " point of", "type": "text" }, "event_type": { @@ -28554,7 +28300,7 @@ "data": { "event": { "delta": { - "text": " I will", + "text": " \"polyju", "type": "text" }, "event_type": { @@ -28574,7 +28320,7 @@ "data": { "event": { "delta": { - "text": " assume you", + "text": "ice\"", "type": "text" }, "event_type": { @@ -28594,7 +28340,7 @@ "data": { "event": { "delta": { - "text": " meant get_bo", + "text": " as it", "type": "text" }, "event_type": { @@ -28614,7 +28360,7 @@ "data": { "event": { "delta": { - "text": "iling_point_with_metadata", + "text": " is not", "type": "text" }, "event_type": { @@ -28634,7 +28380,7 @@ "data": { "event": { "delta": { - "text": ". The boiling point of polyjuice", + "text": " a real", "type": "text" }, "event_type": { @@ -28654,7 +28400,7 @@ "data": { "event": { "delta": { - "text": " is -100.", + "text": " liquid.", "type": "text" }, "event_type": { @@ -28674,94 +28420,53 @@ "data": { "event": { "delta": { - "text": "", + "text": " Poly", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "8dM6i5mO", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:05:03.329281+00:00", - "__module__": "datetime" - }, - "trace_id": "zMJDP5dXRrChi7uE", - "type": "metric", - "unit": "tokens", - "value": 86 + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "juice", + "type": "text" }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "8dM6i5mO", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:05:03.329312+00:00", - "__module__": "datetime" - }, - "trace_id": "zMJDP5dXRrChi7uE", - "type": "metric", - "unit": "tokens", - "value": 45 + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "8dM6i5mO", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:05:03.329318+00:00", - "__module__": "datetime" - }, - "trace_id": "zMJDP5dXRrChi7uE", - "type": "metric", - "unit": "tokens", - "value": 131 - } - ] + "logprobs": null, + "stop_reason": null + }, + "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point_with_metadata\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point_with_metadata` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point_with_metadata\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", + "text": " is a", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -28775,7 +28480,7 @@ "data": { "event": { "delta": { - "text": "The", + "text": " fictional substance", "type": "text" }, "event_type": { @@ -28795,7 +28500,7 @@ "data": { "event": { "delta": { - "text": " function get_boiling_point_with_metadata(", + "text": " from the", "type": "text" }, "event_type": { @@ -28815,7 +28520,7 @@ "data": { "event": { "delta": { - "text": "liquid_name=\"polyjuice\", celcius=True) should be", + "text": " Harry Potter", "type": "text" }, "event_type": { @@ -28835,7 +28540,7 @@ "data": { "event": { "delta": { - "text": " used to get the answer.", + "text": " series", "type": "text" }, "event_type": { @@ -28855,72 +28560,56 @@ "data": { "event": { "delta": { - "text": "", + "text": ".", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { "metric": "prompt_tokens", - "span_id": "pzQMKAJc", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:56.809816+00:00", - "__module__": "datetime" - }, - "trace_id": "018KkGcOThSSiZfE", - "type": "metric", - "unit": "tokens", - "value": 97 + "unit": null, + "value": 84 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "completion_tokens", - "span_id": "pzQMKAJc", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:56.809911+00:00", - "__module__": "datetime" - }, - "trace_id": "018KkGcOThSSiZfE", - "type": "metric", - "unit": "tokens", - "value": 39 + "unit": null, + "value": 51 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "total_tokens", - "span_id": "pzQMKAJc", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:56.809922+00:00", - "__module__": "datetime" - }, - "trace_id": "018KkGcOThSSiZfE", - "type": "metric", - "unit": "tokens", - "value": 136 + "unit": null, + "value": 135 } ] } @@ -28928,7 +28617,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -28956,7 +28645,7 @@ "data": { "event": { "delta": { - "text": "[", + "text": "The", "type": "text" }, "event_type": { @@ -28976,7 +28665,7 @@ "data": { "event": { "delta": { - "text": "get_boiling_point(liquid_name='polyjuice", + "text": " function get_boiling_point is not", "type": "text" }, "event_type": { @@ -28996,7 +28685,7 @@ "data": { "event": { "delta": { - "text": "', celcius=True)]", + "text": " recognized.", "type": "text" }, "event_type": { @@ -29010,42 +28699,6 @@ "metrics": null } }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "celcius": true, - "liquid_name": "polyjuice" - }, - "call_id": "328cb19d-47bb-47cc-8258-a5ca2e26803e", - "tool_name": "get_boiling_point" - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": null - } - }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", @@ -29074,16 +28727,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "dS0bhfN_", + "span_id": "Z7jBGJ-8", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T02:04:53.324788+00:00", + "__datetime__": "2025-03-07T01:45:55.401637+00:00", "__module__": "datetime" }, - "trace_id": "UJz5Cas1SDyQYeBk", + "trace_id": "WxMAq579Q-ixJ3wJ", "type": "metric", "unit": "tokens", - "value": 37 + "value": 93 }, { "attributes": { @@ -29091,16 +28744,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "dS0bhfN_", + "span_id": "Z7jBGJ-8", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T02:04:53.324835+00:00", + "__datetime__": "2025-03-07T01:45:55.401666+00:00", "__module__": "datetime" }, - "trace_id": "UJz5Cas1SDyQYeBk", + "trace_id": "WxMAq579Q-ixJ3wJ", "type": "metric", "unit": "tokens", - "value": 28 + "value": 20 }, { "attributes": { @@ -29108,16 +28761,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "dS0bhfN_", + "span_id": "Z7jBGJ-8", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T02:04:53.324844+00:00", + "__datetime__": "2025-03-07T01:45:55.401670+00:00", "__module__": "datetime" }, - "trace_id": "UJz5Cas1SDyQYeBk", + "trace_id": "WxMAq579Q-ixJ3wJ", "type": "metric", "unit": "tokens", - "value": 65 + "value": 113 } ] } @@ -29125,7 +28778,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point_with_metadata\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point_with_metadata\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -29153,7 +28806,7 @@ "data": { "event": { "delta": { - "text": "[", + "text": "The", "type": "text" }, "event_type": { @@ -29173,7 +28826,7 @@ "data": { "event": { "delta": { - "text": "get_boiling_point_with_metadata", + "text": " function call", "type": "text" }, "event_type": { @@ -29193,7 +28846,7 @@ "data": { "event": { "delta": { - "text": "(liquid_name='polyjuice', cel", + "text": " should be", "type": "text" }, "event_type": { @@ -29213,7 +28866,7 @@ "data": { "event": { "delta": { - "text": "cius=True)]", + "text": " [get", "type": "text" }, "event_type": { @@ -29233,20 +28886,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "celcius": true, - "liquid_name": "polyjuice" - }, - "call_id": "5bb48d00-7d5c-49e2-bddf-e5fdc5f35485", - "tool_name": "get_boiling_point_with_metadata" - }, - "type": "tool_call" + "text": "_boiling", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -29254,11 +28895,7 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } @@ -29269,94 +28906,53 @@ "data": { "event": { "delta": { - "text": "", + "text": "_point_with", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "mfrFN7m2", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:05:02.136501+00:00", - "__module__": "datetime" - }, - "trace_id": "T4eddr4-SMWPQwKA", - "type": "metric", - "unit": "tokens", - "value": 37 + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "_metadata(", + "type": "text" }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "mfrFN7m2", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:05:02.136529+00:00", - "__module__": "datetime" - }, - "trace_id": "T4eddr4-SMWPQwKA", - "type": "metric", - "unit": "tokens", - "value": 30 + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "mfrFN7m2", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:05:02.136535+00:00", - "__module__": "datetime" - }, - "trace_id": "T4eddr4-SMWPQwKA", - "type": "metric", - "unit": "tokens", - "value": 67 - } - ] + "logprobs": null, + "stop_reason": null + }, + "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Give me a sentence that contains the word: hello\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": []}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", + "text": "liquid_name", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -29370,7 +28966,7 @@ "data": { "event": { "delta": { - "text": "When", + "text": "=\"poly", "type": "text" }, "event_type": { @@ -29390,7 +28986,7 @@ "data": { "event": { "delta": { - "text": " I answered the", + "text": "juice", "type": "text" }, "event_type": { @@ -29410,7 +29006,7 @@ "data": { "event": { "delta": { - "text": " phone, the friendly", + "text": "\", cel", "type": "text" }, "event_type": { @@ -29430,7 +29026,7 @@ "data": { "event": { "delta": { - "text": " voice on the other end said \"hello\"", + "text": "cius", "type": "text" }, "event_type": { @@ -29450,7 +29046,7 @@ "data": { "event": { "delta": { - "text": " and asked how I was doing.", + "text": "=True)]", "type": "text" }, "event_type": { @@ -29487,55 +29083,19 @@ }, "metrics": [ { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "prompt_tokens", - "span_id": "tJEuRhla", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:01.044284+00:00", - "__module__": "datetime" - }, - "trace_id": "bnDS7Z41TRO0UyfH", - "type": "metric", - "unit": "tokens", - "value": 30 + "unit": null, + "value": 86 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "completion_tokens", - "span_id": "tJEuRhla", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:01.044312+00:00", - "__module__": "datetime" - }, - "trace_id": "bnDS7Z41TRO0UyfH", - "type": "metric", - "unit": "tokens", - "value": 34 + "unit": null, + "value": 35 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "total_tokens", - "span_id": "tJEuRhla", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:01.044318+00:00", - "__module__": "datetime" - }, - "trace_id": "bnDS7Z41TRO0UyfH", - "type": "metric", - "unit": "tokens", - "value": 64 + "unit": null, + "value": 121 } ] } @@ -29543,7 +29103,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\\n# Sample of data\\nprint(\\\"Data sample from file:\\\")\\nprint(df.head())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\\n# Sample of data\\nprint(\\\"Data sample from file:\\\")\\nprint(df.head())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point_with_metadata\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point_with_metadata` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point_with_metadata\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -29571,47 +29131,7 @@ "data": { "event": { "delta": { - "text": "I", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " am not able", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " to execute this task as", + "text": "The", "type": "text" }, "event_type": { @@ -29631,7 +29151,7 @@ "data": { "event": { "delta": { - "text": " it exceeds the", + "text": " function get_boiling_point_with_metadata(", "type": "text" }, "event_type": { @@ -29651,7 +29171,7 @@ "data": { "event": { "delta": { - "text": " limitations of the functions I", + "text": "liquid_name=\"polyjuice\", celcius=True) should be", "type": "text" }, "event_type": { @@ -29671,7 +29191,7 @@ "data": { "event": { "delta": { - "text": " have been given.", + "text": " used to get the answer.", "type": "text" }, "event_type": { @@ -29713,16 +29233,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "5If5go-q", + "span_id": "pzQMKAJc", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:48.070675+00:00", + "__datetime__": "2025-03-07T01:45:56.809816+00:00", "__module__": "datetime" }, - "trace_id": "StUjhrTMQKKQSRvS", + "trace_id": "018KkGcOThSSiZfE", "type": "metric", "unit": "tokens", - "value": 433 + "value": 97 }, { "attributes": { @@ -29730,16 +29250,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "5If5go-q", + "span_id": "pzQMKAJc", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:48.070742+00:00", + "__datetime__": "2025-03-07T01:45:56.809911+00:00", "__module__": "datetime" }, - "trace_id": "StUjhrTMQKKQSRvS", + "trace_id": "018KkGcOThSSiZfE", "type": "metric", "unit": "tokens", - "value": 31 + "value": 39 }, { "attributes": { @@ -29747,16 +29267,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "5If5go-q", + "span_id": "pzQMKAJc", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:48.070750+00:00", + "__datetime__": "2025-03-07T01:45:56.809922+00:00", "__module__": "datetime" }, - "trace_id": "StUjhrTMQKKQSRvS", + "trace_id": "018KkGcOThSSiZfE", "type": "metric", "unit": "tokens", - "value": 464 + "value": 136 } ] } @@ -29764,7 +29284,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\\n# Sample of data\\nprint(\\\"Data sample from file:\\\")\\nprint(df.head())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -29792,38 +29312,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "import pandas as pd\n# Load data\ndf =", - "type": "tool_call" + "text": "[", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -29842,13 +29332,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " pd.read_csv(\"/var/folders/rb/qv8vwgyj", - "type": "tool_call" + "text": "get_bo", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -29867,13 +29352,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "6yjd3t4pwsy9t0rm0000", - "type": "tool_call" + "text": "iling_point", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -29892,13 +29372,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "gn/T/tmp2x_sml66/ZEjbinQHin", - "type": "tool_call" + "text": "(liquid", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -29917,13 +29392,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "flation.csv\")\n# Rows\nprint(\"Number of rows and columns in the", - "type": "tool_call" + "text": "_name='", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -29942,13 +29412,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " data:\", df.shape)\n# Columns\nprint(\"Columns of the data are:\",", - "type": "tool_call" + "text": "polyju", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -29967,13 +29432,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " len(df.columns))\n# Column names\nprint(\"Columns of the data", - "type": "tool_call" + "text": "ice',", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -29992,13 +29452,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " are:\", df.columns)\n# Column dtypes\nprint(\"Datatype of the columns are:\", df", - "type": "tool_call" + "text": " celci", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -30017,13 +29472,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": ".dtypes)\n# Sample of data\nprint(\"Data sample from file:\")\n", - "type": "tool_call" + "text": "us=True", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -30042,13 +29492,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "print(df.head())", - "type": "tool_call" + "text": ")]", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -30074,14 +29519,11 @@ }, "tool_call": { "arguments": { - "code": "import pandas as pd\n# Load data\ndf = pd.read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4pwsy9t0rm0000gn/T/tmp2x_sml66/ZEjbinQHinflation.csv\")\n# Rows\nprint(\"Number of rows and columns in the data:\", df.shape)\n# Columns\nprint(\"Columns of the data are:\", len(df.columns))\n# Column names\nprint(\"Columns of the data are:\", df.columns)\n# Column dtypes\nprint(\"Datatype of the columns are:\", df.dtypes)\n# Sample of data\nprint(\"Data sample from file:\")\nprint(df.head())" + "celcius": true, + "liquid_name": "polyjuice" }, - "call_id": "1df8b196-9eff-4b06-97e7-ab175c741e8f", - "tool_name": { - "__enum__": "BuiltinTool", - "__module__": "llama_stack.models.llama.datatypes", - "value": "code_interpreter" - } + "call_id": "eafd0169-6523-4625-8d5f-25e902b70abb", + "tool_name": "get_boiling_point" }, "type": "tool_call" }, @@ -30123,55 +29565,19 @@ }, "metrics": [ { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "prompt_tokens", - "span_id": "fLqIbpek", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:40.262304+00:00", - "__module__": "datetime" - }, - "trace_id": "StUjhrTMQKKQSRvS", - "type": "metric", - "unit": "tokens", - "value": 235 + "unit": null, + "value": 37 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "completion_tokens", - "span_id": "fLqIbpek", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:40.262340+00:00", - "__module__": "datetime" - }, - "trace_id": "StUjhrTMQKKQSRvS", - "type": "metric", - "unit": "tokens", - "value": 10 + "unit": null, + "value": 28 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "total_tokens", - "span_id": "fLqIbpek", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:40.262347+00:00", - "__module__": "datetime" - }, - "trace_id": "StUjhrTMQKKQSRvS", - "type": "metric", - "unit": "tokens", - "value": 245 + "unit": null, + "value": 65 } ] } @@ -30179,7 +29585,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -30207,13 +29613,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" + "text": "[", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -30232,13 +29633,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "import pandas as pd\n# Load data\ndf = pd", - "type": "tool_call" + "text": "get_bo", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -30257,13 +29653,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": ".read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4", - "type": "tool_call" + "text": "iling_point", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -30282,13 +29673,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "pwsy9t0rm0000gn/T/tmp2x_sml66/ZEj", - "type": "tool_call" + "text": "_with_metadata", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -30307,13 +29693,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "binQHinflation.csv\")\n# Rows\nprint(\"Number of rows and columns in the data", - "type": "tool_call" + "text": "(liquid", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -30332,13 +29713,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": ":\", df.shape)\n# Columns\nprint(\"Columns of the data are:\", len(df.columns))\n#", - "type": "tool_call" + "text": "_name='", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -30357,13 +29733,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " Column names\nprint(\"Columns of the data are:\", df.columns)\n# Column dtypes\nprint", - "type": "tool_call" + "text": "polyju", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -30382,13 +29753,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "(\"Datatype of the columns are:\", df.dtypes)\n# Sample", - "type": "tool_call" + "text": "ice',", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -30407,13 +29773,48 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " of data\nprint(\"Data sample from file:\")\nprint(df.head())", - "type": "tool_call" + "text": " celci", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "us=True", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ")]", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -30439,14 +29840,11 @@ }, "tool_call": { "arguments": { - "code": "import pandas as pd\n# Load data\ndf = pd.read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4pwsy9t0rm0000gn/T/tmp2x_sml66/ZEjbinQHinflation.csv\")\n# Rows\nprint(\"Number of rows and columns in the data:\", df.shape)\n# Columns\nprint(\"Columns of the data are:\", len(df.columns))\n# Column names\nprint(\"Columns of the data are:\", df.columns)\n# Column dtypes\nprint(\"Datatype of the columns are:\", df.dtypes)\n# Sample of data\nprint(\"Data sample from file:\")\nprint(df.head())" + "celcius": true, + "liquid_name": "polyjuice" }, - "call_id": "c1708ded-f272-4008-b91f-19d61780c394", - "tool_name": { - "__enum__": "BuiltinTool", - "__module__": "llama_stack.models.llama.datatypes", - "value": "code_interpreter" - } + "call_id": "71898ef2-4a6d-4131-ac62-281c8fb5d29c", + "tool_name": "get_boiling_point_with_metadata" }, "type": "tool_call" }, @@ -30488,55 +29886,19 @@ }, "metrics": [ { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "prompt_tokens", - "span_id": "KTMayjIE", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:37.305765+00:00", - "__module__": "datetime" - }, - "trace_id": "StUjhrTMQKKQSRvS", - "type": "metric", - "unit": "tokens", + "unit": null, "value": 37 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "completion_tokens", - "span_id": "KTMayjIE", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:37.305820+00:00", - "__module__": "datetime" - }, - "trace_id": "StUjhrTMQKKQSRvS", - "type": "metric", - "unit": "tokens", - "value": 10 + "unit": null, + "value": 30 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "total_tokens", - "span_id": "KTMayjIE", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:37.305832+00:00", - "__module__": "datetime" - }, - "trace_id": "StUjhrTMQKKQSRvS", - "type": "metric", - "unit": "tokens", - "value": 47 + "unit": null, + "value": 67 } ] } @@ -30544,7 +29906,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics about the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"The error message indicates that the file \\\"\" does not exist. This could be due to a variety of reasons such as the file being deleted, the path being incorrect, or the file not being accessible.\\n\\nTo resolve this issue, you can try the following:\\n\\n1. Check the file path: Ensure that the file path is correct and the file exists at that location.\\n2. Check file permissions: Ensure that the file is accessible and you have the necessary permissions to read it.\\n3. Try a different file: If the file is not accessible, try loading a different file to see if the issue is specific to this file or a general issue with your code.\\n4. Check for typos: Ensure that there are no typos in the file path or the code.\\n\\nIf you are still having issues, please provide more details about the file and the code you are using, and I'll be happy to help further.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Convert the 'Year' column to datetime\\ndf['Year'] = pd.to_datetime(df['Year'], format='%Y')\\n\\n# Group by 'Year' and calculate the average inflation\\ndf_avg_inflation = df.groupby('Year')['Inflation'].mean().reset_index()\\n\\n# Plot the average yearly inflation as a time series\\nplt.figure(figsize=(10,6))\\nplt.plot(df_avg_inflation['Year'], df_avg_inflation['Inflation'], marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Inflation')\\nplt.grid(True)\\nplt.show()\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Give me a sentence that contains the word: hello\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": []}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -30572,47 +29934,7 @@ "data": { "event": { "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " error message indicates that the file \"/var/folders/rb/qv8", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "vwgyj6yjd3t4pwsy9t0", + "text": "When", "type": "text" }, "event_type": { @@ -30632,7 +29954,7 @@ "data": { "event": { "delta": { - "text": "rm0000gn/T/tmp2x_sml66/9vY", + "text": " I answered", "type": "text" }, "event_type": { @@ -30652,7 +29974,7 @@ "data": { "event": { "delta": { - "text": "vmVRoinflation.csv\" does not exist. This could be due to", + "text": " the phone", "type": "text" }, "event_type": { @@ -30672,7 +29994,7 @@ "data": { "event": { "delta": { - "text": " a variety of reasons such as the file being deleted, the path being incorrect", + "text": ",", "type": "text" }, "event_type": { @@ -30692,7 +30014,7 @@ "data": { "event": { "delta": { - "text": ", or the file not being accessible.\n\nTo resolve this issue, you can", + "text": " the", "type": "text" }, "event_type": { @@ -30712,7 +30034,7 @@ "data": { "event": { "delta": { - "text": " try the following:\n\n1. Check the file path: Ensure that the file", + "text": " friendly voice", "type": "text" }, "event_type": { @@ -30732,7 +30054,7 @@ "data": { "event": { "delta": { - "text": " path is correct and the file exists at that location.\n2. Check file permissions:", + "text": " on the", "type": "text" }, "event_type": { @@ -30752,7 +30074,7 @@ "data": { "event": { "delta": { - "text": " Ensure that the file is accessible and you have the necessary permissions to", + "text": " other", "type": "text" }, "event_type": { @@ -30772,7 +30094,7 @@ "data": { "event": { "delta": { - "text": " read it.\n3. Try a different file: If the file is not", + "text": " end", "type": "text" }, "event_type": { @@ -30792,7 +30114,7 @@ "data": { "event": { "delta": { - "text": " accessible, try loading a different file to see if the issue is specific to", + "text": " said \"", "type": "text" }, "event_type": { @@ -30812,7 +30134,7 @@ "data": { "event": { "delta": { - "text": " this file or a general issue with your code.\n4. Check for ty", + "text": "hello\"", "type": "text" }, "event_type": { @@ -30832,7 +30154,7 @@ "data": { "event": { "delta": { - "text": "pos: Ensure that there are no typos in the file path or the", + "text": " and", "type": "text" }, "event_type": { @@ -30852,7 +30174,7 @@ "data": { "event": { "delta": { - "text": " code.\n\nIf you are still having issues, please provide more details about the file and the code", + "text": " asked", "type": "text" }, "event_type": { @@ -30872,7 +30194,7 @@ "data": { "event": { "delta": { - "text": " you are using, and I'll be happy to help further.", + "text": " how I was doing.", "type": "text" }, "event_type": { @@ -30909,55 +30231,19 @@ }, "metrics": [ { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "prompt_tokens", - "span_id": "f28sT2i7", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:23.262530+00:00", - "__module__": "datetime" - }, - "trace_id": "8YKzpfybSiGgrHOF", - "type": "metric", - "unit": "tokens", - "value": 680 + "unit": null, + "value": 30 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "completion_tokens", - "span_id": "f28sT2i7", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:23.262555+00:00", - "__module__": "datetime" - }, - "trace_id": "8YKzpfybSiGgrHOF", - "type": "metric", - "unit": "tokens", - "value": 238 + "unit": null, + "value": 34 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "total_tokens", - "span_id": "f28sT2i7", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:23.262558+00:00", - "__module__": "datetime" - }, - "trace_id": "8YKzpfybSiGgrHOF", - "type": "metric", - "unit": "tokens", - "value": 918 + "unit": null, + "value": 64 } ] } @@ -30965,7 +30251,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics about the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"The error message indicates that the file \\\"\" does not exist. This could be due to a variety of reasons such as the file being deleted, the path being incorrect, or the file not being accessible.\\n\\nTo resolve this issue, you can try the following:\\n\\n1. Check the file path: Ensure that the file path is correct and the file exists at that location.\\n2. Check file permissions: Ensure that the file is accessible and you have the necessary permissions to read it.\\n3. Try a different file: If the file is not accessible, try loading a different file to see if the issue is specific to this file or a general issue with your code.\\n4. Check for typos: Ensure that there are no typos in the file path or the code.\\n\\nIf you are still having issues, please provide more details about the file and the code you are using, and I'll be happy to help further.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\\n# Sample of data\\nprint(\\\"Data sample from file:\\\")\\nprint(df.head())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\\n# Sample of data\\nprint(\\\"Data sample from file:\\\")\\nprint(df.head())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -30993,13 +30279,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" + "text": "I", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -31018,13 +30299,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load the CSV", - "type": "tool_call" + "text": " am not able", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -31043,13 +30319,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " file\ndf = pd.read_csv(\"/var/folders/rb/qv", - "type": "tool_call" + "text": " to execute this task as", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -31068,12 +30339,25804 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" + "text": " it exceeds the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " limitations of the functions I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " have been given.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "5If5go-q", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:48.070675+00:00", + "__module__": "datetime" + }, + "trace_id": "StUjhrTMQKKQSRvS", + "type": "metric", + "unit": "tokens", + "value": 433 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "5If5go-q", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:48.070742+00:00", + "__module__": "datetime" + }, + "trace_id": "StUjhrTMQKKQSRvS", + "type": "metric", + "unit": "tokens", + "value": 31 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "5If5go-q", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:48.070750+00:00", + "__module__": "datetime" + }, + "trace_id": "StUjhrTMQKKQSRvS", + "type": "metric", + "unit": "tokens", + "value": 464 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\\n# Sample of data\\nprint(\\\"Data sample from file:\\\")\\nprint(df.head())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " error message", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " indicates that", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " 'b", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "wrap'", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " was", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " not found", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ". This", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " is likely", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " because the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " path provided", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " is incorrect", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " or the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " does", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " not exist", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " in the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " current working", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " directory.\n\n", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "To resolve", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " this issue", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ", you", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " can", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " try", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the following", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ":\n\n1", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ". Check", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " path:", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " Ensure that", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " path provided", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " is correct", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " and", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " file exists", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " in the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " specified", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " location", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ".\n2", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ". Use", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " correct", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " file path", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ": If", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " is located", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " in a", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " different", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " directory", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ", provide", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the correct", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " path", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ".\n", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "3", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ". Check the file name:", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " Ensure that", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " name", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " is", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " correct and", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " matches the one", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " provided in", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " code.\n", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "4.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " Use the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " absolute file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " path:", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " Instead of", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " using a", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " relative file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " path,", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " try using", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the absolute", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " file path", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " to the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " file.\n\n", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "If you", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " are still", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " encountering issues", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ",", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " please", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " provide more", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " details about", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " and its", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " location,", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " and I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "'ll be", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " happy to", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " assist you", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " further", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ".", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 236 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 183 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 419 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "import pandas", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " as pd", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\n#", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " Load data", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\ndf", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " = pd", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".read_csv", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "(\"/var", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "/f", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "olders/r", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "b/q", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "v", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "qy", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "vgy", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "j", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "6", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "yjd", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "3t", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "4p", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "ws", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "y", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "9t", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "0rm", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "0000", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "gn/T", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "/tmp", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "xgx", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "j", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "70", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "y_/", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "4NO", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "0CF", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "URin", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "flation.csv", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\")\n#", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " Rows\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "print(\"", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "Number of", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " rows and", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " columns in", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " the data", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ":\", df", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".shape)\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "# Columns", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\nprint", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "(\"Columns", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " of the", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " data are", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ":\", len", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "(df.columns", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "))\n#", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " Column names", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\nprint", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "(\"Columns", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " of the", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " data are", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ":\", df", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".columns)\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "# Column", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " dtypes", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\nprint", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "(\"Dat", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "atype of", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " the columns", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " are:\",", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " df.d", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "types)\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "# Sample", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " of data", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\nprint", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "(\"Data", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " sample from", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " file:\")\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "print(df", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".head())", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\n# Load data\ndf = pd.read_csv(\"/var/folders/rb/qvqyvgyj6yjd3t4pwsy9t0rm0000gn/T/tmpxgxj70y_/4NO0CFURinflation.csv\")\n# Rows\nprint(\"Number of rows and columns in the data:\", df.shape)\n# Columns\nprint(\"Columns of the data are:\", len(df.columns))\n# Column names\nprint(\"Columns of the data are:\", df.columns)\n# Column dtypes\nprint(\"Datatype of the columns are:\", df.dtypes)\n# Sample of data\nprint(\"Data sample from file:\")\nprint(df.head())" + }, + "call_id": "923b4193-8e2e-4b28-b55d-3d0e6e9a3b90", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 37 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 47 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics about the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"The error message indicates that the file \\\"\" does not exist. This could be due to a variety of reasons such as the file being deleted, the path being incorrect, or the file not being accessible.\\n\\nTo resolve this issue, you can try the following:\\n\\n1. Check the file path: Ensure that the file path is correct and the file exists at that location.\\n2. Check file permissions: Ensure that the file is accessible and you have the necessary permissions to read it.\\n3. Try a different file: If the file is not accessible, try loading a different file to see if the issue is specific to this file or a general issue with your code.\\n4. Check for typos: Ensure that there are no typos in the file path or the code.\\n\\nIf you are still having issues, please provide more details about the file and the code you are using, and I'll be happy to help further.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Convert the 'Year' column to datetime\\ndf['Year'] = pd.to_datetime(df['Year'], format='%Y')\\n\\n# Group by 'Year' and calculate the average inflation\\ndf_avg_inflation = df.groupby('Year')['Inflation'].mean().reset_index()\\n\\n# Plot the average yearly inflation as a time series\\nplt.figure(figsize=(10,6))\\nplt.plot(df_avg_inflation['Year'], df_avg_inflation['Inflation'], marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Inflation')\\nplt.grid(True)\\nplt.show()\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " error message indicates that the file \"/var/folders/rb/qv8", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "vwgyj6yjd3t4pwsy9t0", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "rm0000gn/T/tmp2x_sml66/9vY", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "vmVRoinflation.csv\" does not exist. This could be due to", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " a variety of reasons such as the file being deleted, the path being incorrect", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ", or the file not being accessible.\n\nTo resolve this issue, you can", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " try the following:\n\n1. Check the file path: Ensure that the file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " path is correct and the file exists at that location.\n2. Check file permissions:", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " Ensure that the file is accessible and you have the necessary permissions to", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " read it.\n3. Try a different file: If the file is not", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " accessible, try loading a different file to see if the issue is specific to", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " this file or a general issue with your code.\n4. Check for ty", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "pos: Ensure that there are no typos in the file path or the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " code.\n\nIf you are still having issues, please provide more details about the file and the code", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " you are using, and I'll be happy to help further.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "f28sT2i7", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:23.262530+00:00", + "__module__": "datetime" + }, + "trace_id": "8YKzpfybSiGgrHOF", + "type": "metric", + "unit": "tokens", + "value": 680 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "f28sT2i7", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:23.262555+00:00", + "__module__": "datetime" + }, + "trace_id": "8YKzpfybSiGgrHOF", + "type": "metric", + "unit": "tokens", + "value": 238 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "f28sT2i7", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:23.262558+00:00", + "__module__": "datetime" + }, + "trace_id": "8YKzpfybSiGgrHOF", + "type": "metric", + "unit": "tokens", + "value": 918 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics about the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"The error message indicates that the file \\\"\" does not exist. This could be due to a variety of reasons such as the file being deleted, the path being incorrect, or the file not being accessible.\\n\\nTo resolve this issue, you can try the following:\\n\\n1. Check the file path: Ensure that the file path is correct and the file exists at that location.\\n2. Check file permissions: Ensure that the file is accessible and you have the necessary permissions to read it.\\n3. Try a different file: If the file is not accessible, try loading a different file to see if the issue is specific to this file or a general issue with your code.\\n4. Check for typos: Ensure that there are no typos in the file path or the code.\\n\\nIf you are still having issues, please provide more details about the file and the code you are using, and I'll be happy to help further.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load the CSV", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " file\ndf = pd.read_csv(\"/var/folders/rb/qv", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "8vwgyj6yjd3t4pwsy9t", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "0rm0000gn/T/tmp2x_sml66/9v", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "YvmVRoinflation.csv\")\n\n# Convert the 'Year'", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " column to datetime\ndf['Year'] = pd.to_datetime(df['Year", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "'], format='%Y')\n\n# Group by", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " 'Year' and calculate the average inflation\ndf_avg_in", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "flation = df.groupby('Year')['Inflation'].mean().reset_index()\n\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "# Plot the average yearly inflation as a time series\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "plt.figure(figsize=(10,6))\nplt.plot(df_avg_inflation['", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "Year'], df_avg_in", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "flation['Inflation'], marker='o')\nplt", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "('Inflation')\nplt.grid(True)\nplt.show()", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4pwsy9t0rm0000gn/T/tmp2x_sml66/9vYvmVRoinflation.csv\")\n\n# Convert the 'Year' column to datetime\ndf['Year'] = pd.to_datetime(df['Year'], format='%Y')\n\n# Group by 'Year' and calculate the average inflation\ndf_avg_inflation = df.groupby('Year')['Inflation'].mean().reset_index()\n\n# Plot the average yearly inflation as a time series\nplt.figure(figsize=(10,6))\nplt.plot(df_avg_inflation['Year'], df_avg_inflation['Inflation'], marker='o')\nplt.title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel('Inflation')\nplt.grid(True)\nplt.show()" + }, + "call_id": "f4efa2d4-e4e7-4ea1-8c5e-6a78bec5816f", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "qQY5sAli", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:21.953806+00:00", + "__module__": "datetime" + }, + "trace_id": "8YKzpfybSiGgrHOF", + "type": "metric", + "unit": "tokens", + "value": 432 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "qQY5sAli", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:21.953843+00:00", + "__module__": "datetime" + }, + "trace_id": "8YKzpfybSiGgrHOF", + "type": "metric", + "unit": "tokens", + "value": 10 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "qQY5sAli", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:21.953847+00:00", + "__module__": "datetime" + }, + "trace_id": "8YKzpfybSiGgrHOF", + "type": "metric", + "unit": "tokens", + "value": 442 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print the shape of the dataframe (number of rows and columns)\\nprint(df.shape)\\n\\n# Print the column names\\nprint(df.columns)\\n\\n# Print the data types of each column\\nprint(df.dtypes)\\n\\n# Print a summary of the dataframe (count, mean, std, min, 25%, 50%, 75%, max)\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"The error message indicates that the file \\\"\" does not exist. This could be due to a number of reasons, such as the file being deleted or moved, or the path being incorrect.\\n\\nTo resolve this issue, you can try the following:\\n\\n1. Check the file path: Make sure the file path is correct and the file exists at that location.\\n2. Check the file name: Make sure the file name is correct and it matches the one you are trying to load.\\n3. Check the file format: Make sure the file is in the correct format (CSV) and it is not corrupted.\\n4. Try a different file: If the file is not available, try loading a different file to see if the issue is specific to this file or not.\\n\\nIf you are still having trouble, please provide more information about the file and the error message you are receiving, and I will do my best to assist you.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Convert the 'Year' column to datetime\\ndf['Year'] = pd.to_datetime(df['Year'], format='%Y')\\n\\n# Group by 'Year' and calculate the average inflation\\ndf_avg_inflation = df.groupby('Year')['Inflation'].mean().reset_index()\\n\\n# Plot the average inflation as a time series\\nplt.figure(figsize=(10,6))\\nplt.plot(df_avg_inflation['Year'], df_avg_inflation['Inflation'], marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Inflation')\\nplt.grid(True)\\nplt.show()\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " error message", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " indicates that", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " \"/var", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "/folders", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "/rb", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "/qv", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "8vw", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "gyj", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "6y", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "jd3", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "t4", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "p", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "wsy", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "9t", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "0rm", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "0000", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "gn/T", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "/tmpx", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "gxj", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "70y", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "_/Z", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "qZ", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "39W", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "iyin", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "flation.csv", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "\" does", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " not exist", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ". This", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " could be", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " due to", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " a", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " number of", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " reasons,", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " such as", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " being deleted", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " or moved", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ", or", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " path being", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " incorrect.\n\n", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "To resolve", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " this issue", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ", you", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " can try", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the following", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ":\n\n1", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ". Check", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " path:", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " Make sure", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " path is", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " correct and", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " exists at", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " that location", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ".\n", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "2", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ". Check", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " name:", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " Make sure", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " name is", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " correct and", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " it matches", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " one you", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " are trying", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " to", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " load", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ".\n3", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ". Check", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " file format", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ": Make", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " sure", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " is in", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the correct", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " format (", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "CSV)", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " and it", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " is not", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " corrupted.\n", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "4.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " Try a", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " different file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ": If", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " is not", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " available,", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " try loading", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " a different", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " file to", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " see if", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " issue is", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " specific to", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " this", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " file or", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " not.\n\n", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "If you", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " are still", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " having trouble", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ",", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " please provide", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " more information", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " about the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " file and", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the error", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " message you", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " are receiving", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ", and", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " I will", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " do my", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " best to", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " assist you", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ".", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 732 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 239 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 971 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print the shape of the dataframe (number of rows and columns)\\nprint(df.shape)\\n\\n# Print the column names\\nprint(df.columns)\\n\\n# Print the data types of each column\\nprint(df.dtypes)\\n\\n# Print a summary of the dataframe (count, mean, std, min, 25%, 50%, 75%, max)\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"The error message indicates that the file \\\"\" does not exist. This could be due to a number of reasons, such as the file being deleted or moved, or the path being incorrect.\\n\\nTo resolve this issue, you can try the following:\\n\\n1. Check the file path: Make sure the file path is correct and the file exists at that location.\\n2. Check the file name: Make sure the file name is correct and it matches the one you are trying to load.\\n3. Check the file format: Make sure the file is in the correct format (CSV) and it is not corrupted.\\n4. Try a different file: If the file is not available, try loading a different file to see if the issue is specific to this file or not.\\n\\nIf you are still having trouble, please provide more information about the file and the error message you are receiving, and I will do my best to assist you.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "import pandas", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " as pd", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "import matplotlib", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".pyplot as", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " plt\n\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "# Load", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " the CSV", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " file\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "df =", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " pd.read", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "_csv(\"/", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "var/f", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "olders/r", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "b/q", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "v8", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "vw", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "gy", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "j6", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "yjd", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "3t", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "4p", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "ws", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "y", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "9t", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "0rm", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "0000", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "gn", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "/T", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "/tmpx", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "gxj", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "70y", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "_/Z", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "qZ", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "39W", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "iyin", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "flation.csv", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\")\n\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "# Convert", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " the '", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "Year'", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " column to", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " datetime\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "df['", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "Year']", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " = pd", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".to_datetime", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "(df['", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "Year", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "'],", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " format='%", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "Y')\n\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "# Group", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " by '", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "Year", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "'", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " and calculate", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " the average", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " inflation\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "df_avg", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "_inflation", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " = df", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".groupby('", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "Year')['", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "Inflation", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "'].mean", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "().", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "reset", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "_index()\n\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "# Plot", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " the average", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " inflation", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " as", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " a time", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " series\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "plt", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".figure(figsize", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "=(10", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ",6", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "))\nplt", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".plot(df", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "_avg_in", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "flation['", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "Year'],", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " df_avg", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "_inflation", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "['", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "In", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "flation'], marker='o", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "')\nplt", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".title", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "('Average", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " Yearly", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " Inflation", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "')\nplt", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".xlabel('", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "Year')\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "plt.ylabel('Inflation')\nplt.grid(True)\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "plt.show", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "()", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4pwsy9t0rm0000gn/T/tmpxgxj70y_/ZqZ39Wiyinflation.csv\")\n\n# Convert the 'Year' column to datetime\ndf['Year'] = pd.to_datetime(df['Year'], format='%Y')\n\n# Group by 'Year' and calculate the average inflation\ndf_avg_inflation = df.groupby('Year')['Inflation'].mean().reset_index()\n\n# Plot the average inflation as a time series\nplt.figure(figsize=(10,6))\nplt.plot(df_avg_inflation['Year'], df_avg_inflation['Inflation'], marker='o')\nplt.title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel('Inflation')\nplt.grid(True)\nplt.show()" + }, + "call_id": "48277e37-1992-4510-9751-9895707cb190", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 484 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 494 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics about the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " error message indicates that the file \"/var/folders/rb/qv8vwgyj6y", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "jd3t4pwsy9t0rm0000gn/T/tmp2x_sml", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "66/9vYvmVRoinflation.csv\" does not exist. This could be", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " due to a variety of", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " reasons such as the file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " being deleted, the path being incorrect, or the file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " not being accessible.\n\nTo resolve this issue, you can try", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the following:\n\n1. Check the file path: Ensure that", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the file path is correct and the file exists at that", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " location.\n2. Check file permissions: Ensure that", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the file is accessible and", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " you have the necessary permissions to read", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " it.\n3. Try a different file: If", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the file is not accessible, try loading a different file to see", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " if the issue is specific to this file or a general", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " issue with your code.\n", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "4. Check for typos: Ensure that", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " there are no typos in the file path or the code.\n\n", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "If you are", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " still having issues, please provide more details about", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the file and the code you are using", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ", and I'll be happy to help further.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "KwfNrQLy", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:19.630894+00:00", + "__module__": "datetime" + }, + "trace_id": "kNsljyzfQV2Cn4aZ", + "type": "metric", + "unit": "tokens", + "value": 192 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "KwfNrQLy", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:19.630987+00:00", + "__module__": "datetime" + }, + "trace_id": "kNsljyzfQV2Cn4aZ", + "type": "metric", + "unit": "tokens", + "value": 238 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "KwfNrQLy", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:19.630996+00:00", + "__module__": "datetime" + }, + "trace_id": "kNsljyzfQV2Cn4aZ", + "type": "metric", + "unit": "tokens", + "value": 430 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print the shape of the dataframe (number of rows and columns)\\nprint(df.shape)\\n\\n# Print the column names\\nprint(df.columns)\\n\\n# Print the data types of each column\\nprint(df.dtypes)\\n\\n# Print a summary of the dataframe (count, mean, std, min, 25%, 50%, 75%, max)\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " error message", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " indicates that", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " \"/", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "var/f", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "olders/r", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "b/q", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "v8", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "vwgy", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "j6", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "yjd", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "3t", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "4p", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "wsy", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "9", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "t0", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "rm000", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "0gn", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "/T/tmp", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "xgx", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "j70", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "y_/", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "Zq", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "Z39", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "Wiy", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "inflation", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ".csv", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "\" does", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " not exist", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ". This", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " could be", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " due to", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " a", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " number of", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " reasons,", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " such as", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " being deleted", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " or", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " moved,", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " or the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " path being", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " incorrect.\n\n", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "To resolve", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " this issue", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ", you", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " can try", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the following", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ":\n\n1", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ".", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " Check the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " file path", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ": Make", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " sure the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " file path", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " is", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " correct", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " and the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " file exists", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " at that", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " location", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ".\n", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "2.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " Check the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " file name", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ": Make", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " sure the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " file name", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " is correct", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " and it", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " matches the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " one you", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " are trying", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " to load", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ".\n3", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ". Check", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " format:", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " Make sure", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " is", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " in", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the correct format (CSV)", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " and it is not", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " corrupted.\n", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "4", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ".", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " Try a", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " different file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ":", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " If", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " is not", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " available,", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " try loading", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " a different", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " file to", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " see if", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " issue is", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " specific to", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " this file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " or not", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ".\n\nIf", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " you are", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " still having", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " trouble,", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " please provide", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " more information", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " about the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " file and", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the error", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " message", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " you are", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " receiving,", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " and I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " will", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " do my", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " best to", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " assist you", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ".", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 243 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 239 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 482 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "import pandas", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " as pd", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\n\n#", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " Load the", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " CSV file", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "df =", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " pd.read", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "_csv(\"/", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "var/f", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "olders/r", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "b/q", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "v8", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "vwgy", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "j6", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "yjd", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "3", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "t", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "4p", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "wsy", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "9t", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "0rm", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "000", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "0", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "gn/T", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "/tmpx", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "gxj70y_/ZqZ39", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "Wiy", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "inflation", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".csv\")\n\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "# Print", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " the first", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " few rows", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " of the", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " dataframe\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "print(df", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".head())\n\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "# Print", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " the shape", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " of the", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " dataframe (", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "number", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " of rows", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " and columns", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ")\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "print", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "(df.shape", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ")\n\n#", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " Print the", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " column names", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\nprint", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "(df.columns", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ")\n\n#", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " Print the", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " data types", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " of each", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " column\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "print(df", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".dtypes", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ")\n\n#", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " Print a", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " summary of", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " the", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " dataframe (", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "count,", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " mean,", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " std", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ",", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " min,", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " 25", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "%, ", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "50%,", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " 75", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "%, max", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ")\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "print", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "(df.describe", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "())", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4pwsy9t0rm0000gn/T/tmpxgxj70y_/ZqZ39Wiyinflation.csv\")\n\n# Print the first few rows of the dataframe\nprint(df.head())\n\n# Print the shape of the dataframe (number of rows and columns)\nprint(df.shape)\n\n# Print the column names\nprint(df.columns)\n\n# Print the data types of each column\nprint(df.dtypes)\n\n# Print a summary of the dataframe (count, mean, std, min, 25%, 50%, 75%, max)\nprint(df.describe())" + }, + "call_id": "c3783091-f1cf-49e0-bc3b-a827618dcbe0", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 36 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 46 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:42933\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:20e5d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0cd43\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"using LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:20e5d\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:20e5d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:20e5d\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:42933\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:20e5d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0cd43\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "[k", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "nowledge_search(query=\"using LoRA in Torchtune", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "\")]", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "using LoRA in Torchtune" + }, + "call_id": "ce4b06be-6e7f-45cf-9555-25398caaf4f1", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "vGtNmXNY", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:32.673350+00:00", + "__module__": "datetime" + }, + "trace_id": "8C2YTmRESTKZ0i1l", + "type": "metric", + "unit": "tokens", + "value": 107 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "vGtNmXNY", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:32.673375+00:00", + "__module__": "datetime" + }, + "trace_id": "8C2YTmRESTKZ0i1l", + "type": "metric", + "unit": "tokens", + "value": 23 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "vGtNmXNY", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:32.673381+00:00", + "__module__": "datetime" + }, + "trace_id": "8C2YTmRESTKZ0i1l", + "type": "metric", + "unit": "tokens", + "value": 130 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:42933\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:20e5d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0cd43\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "'m ready to help. What's", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " your question about Torchtune?", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "7n3WMt3R", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:31.179269+00:00", + "__module__": "datetime" + }, + "trace_id": "BLgI_VzNTCCRs_2T", + "type": "metric", + "unit": "tokens", + "value": 75 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "7n3WMt3R", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:31.179301+00:00", + "__module__": "datetime" + }, + "trace_id": "BLgI_VzNTCCRs_2T", + "type": "metric", + "unit": "tokens", + "value": 25 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "7n3WMt3R", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:31.179308+00:00", + "__module__": "datetime" + }, + "trace_id": "BLgI_VzNTCCRs_2T", + "type": "metric", + "unit": "tokens", + "value": 100 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:8106c\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:a03f3\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0719d\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your first question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"using LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:a03f3\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:a03f3\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:a03f3\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:8106c\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:a03f3\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0719d\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your first question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "[k", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "nowledge_search(query=\"using LoRA in Torchtune", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "\")]", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "using LoRA in Torchtune" + }, + "call_id": "d45a488f-368a-4a3b-a2d9-8fde584fc8f8", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "qLPBZlok", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:26.209198+00:00", + "__module__": "datetime" + }, + "trace_id": "7GQeegpgTI-gqjHp", + "type": "metric", + "unit": "tokens", + "value": 108 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "qLPBZlok", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:26.209239+00:00", + "__module__": "datetime" + }, + "trace_id": "7GQeegpgTI-gqjHp", + "type": "metric", + "unit": "tokens", + "value": 23 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "qLPBZlok", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:26.209247+00:00", + "__module__": "datetime" + }, + "trace_id": "7GQeegpgTI-gqjHp", + "type": "metric", + "unit": "tokens", + "value": 131 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:8106c\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:a03f3\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0719d\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "'m ready to help. What's", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " your first question about Torchtune", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "?", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "mYTkxvK_", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:23.525734+00:00", + "__module__": "datetime" + }, + "trace_id": "kpcdkZQ2SsSOh9Lw", + "type": "metric", + "unit": "tokens", + "value": 75 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "mYTkxvK_", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:23.525763+00:00", + "__module__": "datetime" + }, + "trace_id": "kpcdkZQ2SsSOh9Lw", + "type": "metric", + "unit": "tokens", + "value": 26 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "mYTkxvK_", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:23.525770+00:00", + "__module__": "datetime" + }, + "trace_id": "kpcdkZQ2SsSOh9Lw", + "type": "metric", + "unit": "tokens", + "value": 101 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:89553\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:700ad\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:122a9\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"using LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:700ad\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:700ad\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:700ad\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:89553\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:700ad\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:122a9\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "[k", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "nowledge_search", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "(query=\"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "using Lo", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "RA in", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " Torcht", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "une\")]", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "using LoRA in Torchtune" + }, + "call_id": "0d7b903c-c045-4c45-88d8-1200d809b37a", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 107 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 23 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 130 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:89553\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:700ad\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:122a9\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "'m ready", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " to help", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ". What", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "'s your", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " question", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " about", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " Torcht", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "une?", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 75 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 25 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 100 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:b065e\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:fe9fc\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:c96ea\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"using LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:fe9fc\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:fe9fc\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:fe9fc\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:b065e\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:fe9fc\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:c96ea\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "[k", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "nowledge_search", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "(query=\"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "using Lo", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "RA in", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " Torcht", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "une", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "\")]", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "using LoRA in Torchtune" + }, + "call_id": "2bfd3d0d-5172-4031-af00-fd6c456e4fd9", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 107 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 23 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 130 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:b065e\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:fe9fc\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:c96ea\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "'m ready to help. What", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "'s your", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " question about", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " Tor", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "chtune", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "?", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 75 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 25 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 100 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "[k", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "nowledge_search", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "(query=\"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "Torcht", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "une documentation", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "\")]", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "Torchtune documentation" + }, + "call_id": "70cd0350-8689-4bb5-a0bf-2a9d2112d08d", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 39 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 20 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 59 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Llama3-8B attention type\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:num-1\\nContent: 3 `_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\\nof models across a `range of different benchmarks `_.\\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. In this tutorial we will focus on the 8B size model.\\nThere are a few main changes between Llama2-7B and Llama3-8B models:\\n\\n- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B\\n- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:num-1\\nContent: instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_\\n\\n|\\n\\nGetting access to Llama3-8B-Instruct\\n------------------------------------\\n\\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. First, let's download the model from Hugging Face. You will need to follow the instructions\\non the `official Meta page `_ to gain access to the model.\\nNext, make sure you grab your Hugging Face token from `here `_.\\n\\n\\n.. code-block:: bash\\n\\n tune download meta-llama/Meta-Llama-3\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:num-0\\nContent: :`download Llama3 Instruct weights `\\n\\n\\nTemplate changes from Llama2 to Llama3\\n--------------------------------------\\n\\nThe Llama2 chat model requires a specific template when prompting the pre-trained\\nmodel. Since the chat model was pretrained with this prompt template, if you want to run\\ninference on the model, you'll need to use the same template for optimal performance\\non chat data. Otherwise, the model will just perform standard text completion, which\\nmay or may not align with your intended use case.\\n\\nFrom the `official Llama2 prompt\\ntemplate guide `_\\nfor the Llama2 chat model, we can see that special tags are added:\\n\\n.. code-block:: text\\n\\n [INST] <>\\n You are a helpful, respectful, and honest assistant.\\n <>\\n\\n Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant \\n\\nLlama3 Instruct `overhauled `\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:num-0\\nContent: 'm Meta AI, your friendly AI assistant<|eot_id|>\\n\\nThe tags are entirely different, and they are actually encoded differently than in\\nLlama2. Let's walk through tokenizing an example with the Llama2 template and the\\nLlama3 template to understand how.\\n\\n.. note::\\n The Llama3 Base model uses a `different prompt template\\n `_ than Llama3 Instruct\\n because it has not yet been instruct tuned and the extra special tokens are untrained. If you\\n are running inference on the Llama3 Base model without fine-tuning we recommend the base\\n template for optimal performance. Generally, for instruct and chat data, we recommend using\\n Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\\n Llama3 Instruct.\\n\\n.. _prompt_template_vs_special_tokens:\\n\\nTokenizing prompt templates & special tokens\\n--------------------------------------------\\n\\nLet's say I have a sample of a single user-assistant turn accompanied with a system\\nprompt:\\n\\n.. code-block:: python\\n\\n sample = [\\n {\\n \\\"role\\\": \\\"system\\\",\\n \\\"\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:num-3\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\", and \\\"output_proj\\\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\\\"q_proj\\\", \\\"v_proj\\\"])\\n\\n.. note::\\n\\n Calling :func:`lora_llama_2\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Insert documents into memory\", \"parameters\": {}, \"tool_name\": \"insert_into_memory\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "L", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "lama3", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "-", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "8", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "B uses", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " grouped-query", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " attention instead", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " of the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " standard", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " multi-head", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " attention.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 80 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 28 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 108 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Llama3-8B attention type\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:num-1\\nContent: 3 `_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\\nof models across a `range of different benchmarks `_.\\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. In this tutorial we will focus on the 8B size model.\\nThere are a few main changes between Llama2-7B and Llama3-8B models:\\n\\n- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B\\n- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:num-1\\nContent: instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_\\n\\n|\\n\\nGetting access to Llama3-8B-Instruct\\n------------------------------------\\n\\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. First, let's download the model from Hugging Face. You will need to follow the instructions\\non the `official Meta page `_ to gain access to the model.\\nNext, make sure you grab your Hugging Face token from `here `_.\\n\\n\\n.. code-block:: bash\\n\\n tune download meta-llama/Meta-Llama-3\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:num-0\\nContent: :`download Llama3 Instruct weights `\\n\\n\\nTemplate changes from Llama2 to Llama3\\n--------------------------------------\\n\\nThe Llama2 chat model requires a specific template when prompting the pre-trained\\nmodel. Since the chat model was pretrained with this prompt template, if you want to run\\ninference on the model, you'll need to use the same template for optimal performance\\non chat data. Otherwise, the model will just perform standard text completion, which\\nmay or may not align with your intended use case.\\n\\nFrom the `official Llama2 prompt\\ntemplate guide `_\\nfor the Llama2 chat model, we can see that special tags are added:\\n\\n.. code-block:: text\\n\\n [INST] <>\\n You are a helpful, respectful, and honest assistant.\\n <>\\n\\n Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant \\n\\nLlama3 Instruct `overhauled `\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:num-0\\nContent: 'm Meta AI, your friendly AI assistant<|eot_id|>\\n\\nThe tags are entirely different, and they are actually encoded differently than in\\nLlama2. Let's walk through tokenizing an example with the Llama2 template and the\\nLlama3 template to understand how.\\n\\n.. note::\\n The Llama3 Base model uses a `different prompt template\\n `_ than Llama3 Instruct\\n because it has not yet been instruct tuned and the extra special tokens are untrained. If you\\n are running inference on the Llama3 Base model without fine-tuning we recommend the base\\n template for optimal performance. Generally, for instruct and chat data, we recommend using\\n Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\\n Llama3 Instruct.\\n\\n.. _prompt_template_vs_special_tokens:\\n\\nTokenizing prompt templates & special tokens\\n--------------------------------------------\\n\\nLet's say I have a sample of a single user-assistant turn accompanied with a system\\nprompt:\\n\\n.. code-block:: python\\n\\n sample = [\\n {\\n \\\"role\\\": \\\"system\\\",\\n \\\"\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:num-3\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\", and \\\"output_proj\\\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\\\"q_proj\\\", \\\"v_proj\\\"])\\n\\n.. note::\\n\\n Calling :func:`lora_llama_2\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "L", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "lama3", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "-8", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "B uses", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " grouped", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "-query attention", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " instead of", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the standard", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " multi", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "-head attention", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ".", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 80 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 28 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 108 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Insert documents into memory\", \"parameters\": {}, \"tool_name\": \"insert_into_memory\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "[k", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "nowledge_search", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "(query=\"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "Llama", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "3-", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "8B", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " attention type", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "\")]", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "Llama3-8B attention type" + }, + "call_id": "d7f3056e-3c5c-4bbc-869e-b617a35bdbb4", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 40 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 24 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 64 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "[k", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "nowledge_search", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "(query=\"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "L", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "lama3", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "-8", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "B attention", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " type\")]", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "Llama3-8B attention type" + }, + "call_id": "cd89b59f-7caa-4669-85c8-df6ba3892e77", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 40 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 24 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 64 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the current CEO of Meta is.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"current CEO of Meta\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"{\\\"query\\\": \\\"current CEO of Meta\\\", \\\"top_k\\\": [{\\\"title\\\": \\\"Executives - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer Joel Kaplan, Chief Global Affairs Officer Susan Li, Chief Financial Officer Javier Olivan, Chief Operating Officer Chris Cox, Chief Product Officer Andrew \\\\u2018Boz\\\\u2019 Bosworth, Chief Technology Officer Jennifer Newstead, Chief Legal Officer Dave Wehner, Chief Strategy Officer Will Cathcart, Head of WhatsApp Naomi Gleit, Head of Product John Hegeman, Chief Revenue Officer Adam Mosseri, Head of Instagram Erin Egan, Chief Privacy Officer, Policy Michel Protti, Chief Privacy Officer, Product Alex Schultz, Chief Marketing Officer and VP of Analytics Tom Alison, Head of Facebook Nicola Mendelsohn, Head of Global Business Group Ahmad Al-Dahle, VP and Head of GenAI at Meta Joelle Pineau, Vice President of AI Research and Head of FAIR at Meta\\\", \\\"score\\\": 0.8190992, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\\\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\\\u00a9 2025 Meta\\\", \\\"score\\\": 0.79080546, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Meet the Executive CSuite Team of Meta (Facebook) [2025]\\\", \\\"url\\\": \\\"https://digitaldefynd.com/IQ/meet-the-executive-csuite-team-of-meta-facebook/\\\", \\\"content\\\": \\\"Harvard University Executive Programs Free Harvard University Courses As a chief financial officer of Meta, Susan Li oversees the firm\\\\u2019s finance and facilities team to keep track of the company\\\\u2019s overall financial health. The chief operating officer of Meta, Javier Olivan, oversees the firm\\\\u2019s business team, infrastructure, and other products. Andrew Bosworth, called Boz, serves as chief technology officer at Meta and is responsible for leading the firm\\\\u2019s AR/VR organization, Reality Labs. Andrew has also served as engineering director to oversee events, mobile monetization, and feed ads and as VP of ads and business platforms to lead engineering, design, analytics, and product teams. Meta\\\\u2019s c-suite team comprises experienced and diverse executives, having extensive experience in technology, finance, legal, and all major industries.\\\", \\\"score\\\": 0.7602419, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Meta - Leadership & Governance\\\", \\\"url\\\": \\\"https://investor.atmeta.com/leadership-and-governance/\\\", \\\"content\\\": \\\"Mr. Andreessen was a co-founder of Netscape Communications Corporation, a software company, serving in various positions, including Chief Technology Officer and Executive Vice President of Products. Ms. Killefer also served as Assistant Secretary for Management, Chief Financial Officer, and Chief Operating Officer of the U.S. Department of the Treasury from 1997 to 2000 and as a member of the IRS Oversight Board from 2000 to 2005, including as Chair of the IRS Oversight Board from 2002 to 2004. Ms. Travis has served as Executive Vice President and Chief Financial Officer of The Estee Lauder Companies Inc., a global manufacturer and marketer of skin care, makeup, fragrance and hair care products, since August 2012.\\\", \\\"score\\\": 0.6175132, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg - Wikipedia\\\", \\\"url\\\": \\\"https://en.wikipedia.org/wiki/Mark_Zuckerberg\\\", \\\"content\\\": \\\"They began dating in 2003.[175] In September 2010, Chan, who was a medical student at the University of California, San Francisco at the time,[176] moved into his rented house in Palo Alto, California.[177][178] They married on May 19, 2012, in the grounds of his mansion in an event that also celebrated her graduation from medical school.[179][180] Zuckerberg revealed in July 2015 that they were expecting a baby girl and that Chan had previously experienced three miscarriages.[181] Their first daughter was born in December 2015.[182] They announced in a Chinese New Year video that their daughter's Chinese name is Chen Mingyu (Chinese: \\\\u9648\\\\u660e\\\\u5b87).[183] Their second daughter was born in August 2017.[184] Zuckerberg and his wife welcomed their third daughter in March 2023 and announced the news across his social media pages.[185] The couple also have a Puli dog named Beast,[186] who has over two million followers on Facebook.[187] Zuckerberg commissioned the visual artist Daniel Arsham to build a 7-foot-tall sculpture of his wife, which was unveiled in 2024.[188]\\\", \\\"score\\\": 0.05570498, \\\"raw_content\\\": null}]}\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " current CEO of Meta is", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " Mark", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " Zuckerberg.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 1235 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 19 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 1254 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the current CEO of Meta is.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"current CEO of Meta\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"{\\\"query\\\": \\\"current CEO of Meta\\\", \\\"top_k\\\": [{\\\"title\\\": \\\"Meta - Leadership & Governance\\\", \\\"url\\\": \\\"https://investor.atmeta.com/leadership-and-governance/\\\", \\\"content\\\": \\\"Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. Mark is responsible for setting the overall direction and product strategy for the company. He leads the design of Meta's services and development of its core technology and infrastructure. Mark studied computer science at Harvard\\\", \\\"score\\\": 0.8342047, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Executives - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer Joel Kaplan, Chief Global Affairs Officer Susan Li, Chief Financial Officer Javier Olivan, Chief Operating Officer Chris Cox, Chief Product Officer Andrew \\\\u2018Boz\\\\u2019 Bosworth, Chief Technology Officer Jennifer Newstead, Chief Legal Officer Dave Wehner, Chief Strategy Officer Will Cathcart, Head of WhatsApp Naomi Gleit, Head of Product John Hegeman, Chief Revenue Officer Adam Mosseri, Head of Instagram Erin Egan, Chief Privacy Officer, Policy Michel Protti, Chief Privacy Officer, Product Alex Schultz, Chief Marketing Officer and VP of Analytics Tom Alison, Head of Facebook Nicola Mendelsohn, Head of Global Business Group Ahmad Al-Dahle, VP and Head of GenAI at Meta Joelle Pineau, Vice President of AI Research and Head of FAIR at Meta\\\", \\\"score\\\": 0.8190992, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\\\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\\\u00a9 2025 Meta\\\", \\\"score\\\": 0.79099923, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Meet the Executive CSuite Team of Meta (Facebook) [2025]\\\", \\\"url\\\": \\\"https://digitaldefynd.com/IQ/meet-the-executive-csuite-team-of-meta-facebook/\\\", \\\"content\\\": \\\"Harvard University Executive Programs Free Harvard University Courses As a chief financial officer of Meta, Susan Li oversees the firm\\\\u2019s finance and facilities team to keep track of the company\\\\u2019s overall financial health. The chief operating officer of Meta, Javier Olivan, oversees the firm\\\\u2019s business team, infrastructure, and other products. Andrew Bosworth, called Boz, serves as chief technology officer at Meta and is responsible for leading the firm\\\\u2019s AR/VR organization, Reality Labs. Andrew has also served as engineering director to oversee events, mobile monetization, and feed ads and as VP of ads and business platforms to lead engineering, design, analytics, and product teams. Meta\\\\u2019s c-suite team comprises experienced and diverse executives, having extensive experience in technology, finance, legal, and all major industries.\\\", \\\"score\\\": 0.7602419, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg - Wikipedia\\\", \\\"url\\\": \\\"https://en.wikipedia.org/wiki/Mark_Zuckerberg\\\", \\\"content\\\": \\\"They began dating in 2003.[175] In September 2010, Chan, who was a medical student at the University of California, San Francisco at the time,[176] moved into his rented house in Palo Alto, California.[177][178] They married on May 19, 2012, in the grounds of his mansion in an event that also celebrated her graduation from medical school.[179][180] Zuckerberg revealed in July 2015 that they were expecting a baby girl and that Chan had previously experienced three miscarriages.[181] Their first daughter was born in December 2015.[182] They announced in a Chinese New Year video that their daughter's Chinese name is Chen Mingyu (Chinese: \\\\u9648\\\\u660e\\\\u5b87).[183] Their second daughter was born in August 2017.[184] Zuckerberg and his wife welcomed their third daughter in March 2023 and announced the news across his social media pages.[185] The couple also have a Puli dog named Beast,[186] who has over two million followers on Facebook.[187] Zuckerberg commissioned the visual artist Daniel Arsham to build a 7-foot-tall sculpture of his wife, which was unveiled in 2024.[188]\\\", \\\"score\\\": 0.05564338, \\\"raw_content\\\": null}]}\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " current CEO of Meta is Mark Zuckerberg.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" }, - "tool_call": "8vwgyj6yjd3t4pwsy9t", + "metric": "prompt_tokens", + "span_id": "oB7hDf6E", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:07.084924+00:00", + "__module__": "datetime" + }, + "trace_id": "hwA8OLUhQ1qa3ecF", + "type": "metric", + "unit": "tokens", + "value": 1145 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "oB7hDf6E", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:07.084934+00:00", + "__module__": "datetime" + }, + "trace_id": "hwA8OLUhQ1qa3ecF", + "type": "metric", + "unit": "tokens", + "value": 19 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "oB7hDf6E", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:07.084936+00:00", + "__module__": "datetime" + }, + "trace_id": "hwA8OLUhQ1qa3ecF", + "type": "metric", + "unit": "tokens", + "value": 1164 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the current CEO of Meta is.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "brave", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "_search.call", "type": "tool_call" }, "event_type": { @@ -31098,7 +56161,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "0rm0000gn/T/tmp2x_sml66/9v", + "tool_call": "(query=\"", "type": "tool_call" }, "event_type": { @@ -31123,7 +56186,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "YvmVRoinflation.csv\")\n\n# Convert the 'Year'", + "tool_call": "current", "type": "tool_call" }, "event_type": { @@ -31148,7 +56211,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " column to datetime\ndf['Year'] = pd.to_datetime(df['Year", + "tool_call": " CEO of", "type": "tool_call" }, "event_type": { @@ -31173,7 +56236,42 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "'], format='%Y')\n\n# Group by", + "tool_call": " Meta\")", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "current CEO of Meta" + }, + "call_id": "112fe886-dc25-4347-9b54-b52571b0cdb5", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "brave_search" + } + }, "type": "tool_call" }, "event_type": { @@ -31182,6 +56280,235 @@ "value": "progress" }, "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 34 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 44 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": \"get_boiling_point\", \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " boiling point", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " of poly", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "ju", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "ice", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " is -", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "100 degrees", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " Celsius.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, "stop_reason": null }, "metrics": null @@ -31193,43 +56520,58 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " 'Year' and calculate the average inflation\ndf_avg_in", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 77 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 23 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 100 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "flation = df.groupby('Year')['Inflation'].mean().reset_index()\n\n", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -31243,13 +56585,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "# Plot the average yearly inflation as a time series\n", - "type": "tool_call" + "text": "The", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -31268,13 +56605,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "plt.figure(figsize=(10,6))\nplt.plot(df_avg_inflation['", - "type": "tool_call" + "text": " boiling point", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -31293,13 +56625,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "Year'], df_avg_in", - "type": "tool_call" + "text": " of poly", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -31318,13 +56645,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "flation['Inflation'], marker='o')\nplt", - "type": "tool_call" + "text": "juice", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -31343,13 +56665,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": ".title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel", - "type": "tool_call" + "text": " is -", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -31368,13 +56685,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "('Inflation')\nplt.grid(True)\nplt.show()", - "type": "tool_call" + "text": "100 degrees", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -31393,23 +56705,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4pwsy9t0rm0000gn/T/tmp2x_sml66/9vYvmVRoinflation.csv\")\n\n# Convert the 'Year' column to datetime\ndf['Year'] = pd.to_datetime(df['Year'], format='%Y')\n\n# Group by 'Year' and calculate the average inflation\ndf_avg_inflation = df.groupby('Year')['Inflation'].mean().reset_index()\n\n# Plot the average yearly inflation as a time series\nplt.figure(figsize=(10,6))\nplt.plot(df_avg_inflation['Year'], df_avg_inflation['Inflation'], marker='o')\nplt.title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel('Inflation')\nplt.grid(True)\nplt.show()" - }, - "call_id": "f4efa2d4-e4e7-4ea1-8c5e-6a78bec5816f", - "tool_name": { - "__enum__": "BuiltinTool", - "__module__": "llama_stack.models.llama.datatypes", - "value": "code_interpreter" - } - }, - "type": "tool_call" + "text": " Celsius.", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -31417,11 +56714,7 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } @@ -31449,55 +56742,19 @@ }, "metrics": [ { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "prompt_tokens", - "span_id": "qQY5sAli", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:21.953806+00:00", - "__module__": "datetime" - }, - "trace_id": "8YKzpfybSiGgrHOF", - "type": "metric", - "unit": "tokens", - "value": 432 + "unit": null, + "value": 77 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "completion_tokens", - "span_id": "qQY5sAli", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:21.953843+00:00", - "__module__": "datetime" - }, - "trace_id": "8YKzpfybSiGgrHOF", - "type": "metric", - "unit": "tokens", - "value": 10 + "unit": null, + "value": 23 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "total_tokens", - "span_id": "qQY5sAli", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:21.953847+00:00", - "__module__": "datetime" - }, - "trace_id": "8YKzpfybSiGgrHOF", - "type": "metric", - "unit": "tokens", - "value": 442 + "unit": null, + "value": 100 } ] } @@ -31505,7 +56762,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics about the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"required\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -31553,7 +56810,7 @@ "data": { "event": { "delta": { - "text": " error message indicates that the file \"/var/folders/rb/qv8vwgyj6y", + "text": " boiling point", "type": "text" }, "event_type": { @@ -31573,7 +56830,7 @@ "data": { "event": { "delta": { - "text": "jd3t4pwsy9t0rm0000gn/T/tmp2x_sml", + "text": " of poly", "type": "text" }, "event_type": { @@ -31593,7 +56850,7 @@ "data": { "event": { "delta": { - "text": "66/9vYvmVRoinflation.csv\" does not exist. This could be", + "text": "juice", "type": "text" }, "event_type": { @@ -31613,7 +56870,7 @@ "data": { "event": { "delta": { - "text": " due to a variety of", + "text": " is", "type": "text" }, "event_type": { @@ -31633,7 +56890,47 @@ "data": { "event": { "delta": { - "text": " reasons such as the file", + "text": " -", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "100 degrees", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " Celsius.", "type": "text" }, "event_type": { @@ -31653,13 +56950,58 @@ "data": { "event": { "delta": { - "text": " being deleted, the path being incorrect, or the file", + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 77 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 23 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 100 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": \"get_boiling_point\", \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -31673,7 +57015,7 @@ "data": { "event": { "delta": { - "text": " not being accessible.\n\nTo resolve this issue, you can try", + "text": "The", "type": "text" }, "event_type": { @@ -31693,7 +57035,7 @@ "data": { "event": { "delta": { - "text": " the following:\n\n1. Check the file path: Ensure that", + "text": " function call should be", "type": "text" }, "event_type": { @@ -31713,7 +57055,7 @@ "data": { "event": { "delta": { - "text": " the file path is correct and the file exists at that", + "text": ":\n[", "type": "text" }, "event_type": { @@ -31733,7 +57075,7 @@ "data": { "event": { "delta": { - "text": " location.\n2. Check file permissions: Ensure that", + "text": "get", "type": "text" }, "event_type": { @@ -31753,7 +57095,7 @@ "data": { "event": { "delta": { - "text": " the file is accessible and", + "text": "_boiling_point(liquid_name='polyjuice', celci", "type": "text" }, "event_type": { @@ -31773,7 +57115,7 @@ "data": { "event": { "delta": { - "text": " you have the necessary permissions to read", + "text": "us=True)]", "type": "text" }, "event_type": { @@ -31793,33 +57135,94 @@ "data": { "event": { "delta": { - "text": " it.\n3. Try a different file: If", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "JN7UZs_c", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:42.473221+00:00", + "__module__": "datetime" + }, + "trace_id": "H3r-_Zh-TVqtSp7k", + "type": "metric", + "unit": "tokens", + "value": 86 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "JN7UZs_c", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:42.473254+00:00", + "__module__": "datetime" + }, + "trace_id": "H3r-_Zh-TVqtSp7k", + "type": "metric", + "unit": "tokens", + "value": 34 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "JN7UZs_c", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:42.473261+00:00", + "__module__": "datetime" + }, + "trace_id": "H3r-_Zh-TVqtSp7k", + "type": "metric", + "unit": "tokens", + "value": 120 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " the file is not accessible, try loading a different file to see", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -31833,7 +57236,7 @@ "data": { "event": { "delta": { - "text": " if the issue is specific to this file or a general", + "text": "The", "type": "text" }, "event_type": { @@ -31853,7 +57256,7 @@ "data": { "event": { "delta": { - "text": " issue with your code.\n", + "text": " function `get_boiling_point`", "type": "text" }, "event_type": { @@ -31873,7 +57276,7 @@ "data": { "event": { "delta": { - "text": "4. Check for typos: Ensure that", + "text": " is not a real function and cannot be", "type": "text" }, "event_type": { @@ -31893,7 +57296,7 @@ "data": { "event": { "delta": { - "text": " there are no typos in the file path or the code.\n\n", + "text": " used to determine the boiling point of polyju", "type": "text" }, "event_type": { @@ -31913,7 +57316,7 @@ "data": { "event": { "delta": { - "text": "If you are", + "text": "ice. Polyjuice is a fictional substance from the", "type": "text" }, "event_type": { @@ -31933,7 +57336,7 @@ "data": { "event": { "delta": { - "text": " still having issues, please provide more details about", + "text": " Harry Potter series and does not have a real-world boiling", "type": "text" }, "event_type": { @@ -31953,7 +57356,7 @@ "data": { "event": { "delta": { - "text": " the file and the code you are using", + "text": " point. If you have any other questions or need help", "type": "text" }, "event_type": { @@ -31973,7 +57376,7 @@ "data": { "event": { "delta": { - "text": ", and I'll be happy to help further.", + "text": " with a different topic, feel free to ask!", "type": "text" }, "event_type": { @@ -32015,16 +57418,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "KwfNrQLy", + "span_id": "aCPTIc0d", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:19.630894+00:00", + "__datetime__": "2025-03-07T01:53:27.227208+00:00", "__module__": "datetime" }, - "trace_id": "kNsljyzfQV2Cn4aZ", + "trace_id": "4DRyVE86RpCeqfpE", "type": "metric", "unit": "tokens", - "value": 192 + "value": 86 }, { "attributes": { @@ -32032,16 +57435,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "KwfNrQLy", + "span_id": "aCPTIc0d", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:19.630987+00:00", + "__datetime__": "2025-03-07T01:53:27.227251+00:00", "__module__": "datetime" }, - "trace_id": "kNsljyzfQV2Cn4aZ", + "trace_id": "4DRyVE86RpCeqfpE", "type": "metric", "unit": "tokens", - "value": 238 + "value": 78 }, { "attributes": { @@ -32049,16 +57452,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "KwfNrQLy", + "span_id": "aCPTIc0d", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:19.630996+00:00", + "__datetime__": "2025-03-07T01:53:27.227258+00:00", "__module__": "datetime" }, - "trace_id": "kNsljyzfQV2Cn4aZ", + "trace_id": "4DRyVE86RpCeqfpE", "type": "metric", "unit": "tokens", - "value": 430 + "value": 164 } ] } @@ -32066,7 +57469,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"required\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -32094,113 +57497,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "import pandas as pd\n\n# Load the CSV file\ndf = pd.read", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "_csv(\"/var/folders/rb/qv8vwgyj6y", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "jd3t4pwsy9t0rm0000gn/T", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "/tmp2x_sml66/9vYvmVRoinflation.csv", - "type": "tool_call" + "text": "The", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -32219,13 +57517,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "\")\n\n# Print the first few rows of the dataframe\nprint(df.head())\n\n", - "type": "tool_call" + "text": " function call should be in the following format", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -32240,17 +57533,12 @@ }, { "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "# Print information about the dataframe\nprint(df", - "type": "tool_call" + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ": [function_name(parameters)]. However", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -32269,13 +57557,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": ".info())\n\n# Print summary statistics about the dataframe\nprint(df.describe", - "type": "tool_call" + "text": ", the function get_boiling_point is not recognized", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -32294,13 +57577,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "())", - "type": "tool_call" + "text": ". If the function", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -32319,23 +57597,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "code": "import pandas as pd\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4pwsy9t0rm0000gn/T/tmp2x_sml66/9vYvmVRoinflation.csv\")\n\n# Print the first few rows of the dataframe\nprint(df.head())\n\n# Print information about the dataframe\nprint(df.info())\n\n# Print summary statistics about the dataframe\nprint(df.describe())" - }, - "call_id": "5bbfebeb-4360-4ef9-a9e2-4227a8e8c699", - "tool_name": { - "__enum__": "BuiltinTool", - "__module__": "llama_stack.models.llama.datatypes", - "value": "code_interpreter" - } - }, - "type": "tool_call" + "text": " is supposed to return the boiling point of a liquid, it should be defined", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -32343,11 +57606,7 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } @@ -32358,94 +57617,33 @@ "data": { "event": { "delta": { - "text": "", + "text": " before it can be used. \n\nIn this", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "AyEX3So6", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:17.873486+00:00", - "__module__": "datetime" - }, - "trace_id": "kNsljyzfQV2Cn4aZ", - "type": "metric", - "unit": "tokens", - "value": 36 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "AyEX3So6", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:17.873500+00:00", - "__module__": "datetime" - }, - "trace_id": "kNsljyzfQV2Cn4aZ", - "type": "metric", - "unit": "tokens", - "value": 10 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "AyEX3So6", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:17.873503+00:00", - "__module__": "datetime" - }, - "trace_id": "kNsljyzfQV2Cn4aZ", - "type": "metric", - "unit": "tokens", - "value": 46 - } - ] + "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:42933\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:20e5d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0cd43\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"using LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:20e5d\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:20e5d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:20e5d\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:20e5d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0cd43\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", + "text": "ice',", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -33040,7 +58258,7 @@ "data": { "event": { "delta": { - "text": "[k", + "text": " celci", "type": "text" }, "event_type": { @@ -33060,7 +58278,7 @@ "data": { "event": { "delta": { - "text": "nowledge_search(query=\"using LoRA in Torchtune", + "text": "us=True", "type": "text" }, "event_type": { @@ -33080,7 +58298,7 @@ "data": { "event": { "delta": { - "text": "\")]", + "text": ")]", "type": "text" }, "event_type": { @@ -33107,10 +58325,11 @@ }, "tool_call": { "arguments": { - "query": "using LoRA in Torchtune" + "celcius": true, + "liquid_name": "polyjuice" }, - "call_id": "ce4b06be-6e7f-45cf-9555-25398caaf4f1", - "tool_name": "knowledge_search" + "call_id": "dcf85480-0aa4-4f86-9720-6c030aa67344", + "tool_name": "get_boiling_point" }, "type": "tool_call" }, @@ -33152,55 +58371,19 @@ }, "metrics": [ { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "prompt_tokens", - "span_id": "vGtNmXNY", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:32.673350+00:00", - "__module__": "datetime" - }, - "trace_id": "8C2YTmRESTKZ0i1l", - "type": "metric", - "unit": "tokens", - "value": 107 + "unit": null, + "value": 30 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "completion_tokens", - "span_id": "vGtNmXNY", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:32.673375+00:00", - "__module__": "datetime" - }, - "trace_id": "8C2YTmRESTKZ0i1l", - "type": "metric", - "unit": "tokens", - "value": 23 + "unit": null, + "value": 28 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "total_tokens", - "span_id": "vGtNmXNY", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:32.673381+00:00", - "__module__": "datetime" - }, - "trace_id": "8C2YTmRESTKZ0i1l", - "type": "metric", - "unit": "tokens", - "value": 130 + "unit": null, + "value": 58 } ] } @@ -33208,7 +58391,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:42933\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:20e5d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0cd43\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -33236,7 +58419,7 @@ "data": { "event": { "delta": { - "text": "I", + "text": "[", "type": "text" }, "event_type": { @@ -33256,7 +58439,7 @@ "data": { "event": { "delta": { - "text": "'m ready to help. What's", + "text": "get_bo", "type": "text" }, "event_type": { @@ -33276,7 +58459,7 @@ "data": { "event": { "delta": { - "text": " your question about Torchtune?", + "text": "iling", "type": "text" }, "event_type": { @@ -33296,94 +58479,33 @@ "data": { "event": { "delta": { - "text": "", + "text": "_point", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "7n3WMt3R", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:31.179269+00:00", - "__module__": "datetime" - }, - "trace_id": "BLgI_VzNTCCRs_2T", - "type": "metric", - "unit": "tokens", - "value": 75 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "7n3WMt3R", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:31.179301+00:00", - "__module__": "datetime" - }, - "trace_id": "BLgI_VzNTCCRs_2T", - "type": "metric", - "unit": "tokens", - "value": 25 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "7n3WMt3R", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:31.179308+00:00", - "__module__": "datetime" - }, - "trace_id": "BLgI_VzNTCCRs_2T", - "type": "metric", - "unit": "tokens", - "value": 100 - } - ] + "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:8106c\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:a03f3\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0719d\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your first question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"using LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:a03f3\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:a03f3\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:a03f3\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:8106c\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:a03f3\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0719d\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your first question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", + "text": " boiling", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -34018,7 +59120,7 @@ "data": { "event": { "delta": { - "text": "[k", + "text": " point.", "type": "text" }, "event_type": { @@ -34038,7 +59140,7 @@ "data": { "event": { "delta": { - "text": "nowledge_search(query=\"using LoRA in Torchtune", + "text": " Poly", "type": "text" }, "event_type": { @@ -34058,7 +59160,7 @@ "data": { "event": { "delta": { - "text": "\")]", + "text": "juice", "type": "text" }, "event_type": { @@ -34078,19 +59180,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "using LoRA in Torchtune" - }, - "call_id": "d45a488f-368a-4a3b-a2d9-8fde584fc8f8", - "tool_name": "knowledge_search" - }, - "type": "tool_call" + "text": " Potion", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -34098,11 +59189,7 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } @@ -34113,94 +59200,33 @@ "data": { "event": { "delta": { - "text": "", + "text": " is a", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "qLPBZlok", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:26.209198+00:00", - "__module__": "datetime" - }, - "trace_id": "7GQeegpgTI-gqjHp", - "type": "metric", - "unit": "tokens", - "value": 108 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "qLPBZlok", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:26.209239+00:00", - "__module__": "datetime" - }, - "trace_id": "7GQeegpgTI-gqjHp", - "type": "metric", - "unit": "tokens", - "value": 23 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "qLPBZlok", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:26.209247+00:00", - "__module__": "datetime" - }, - "trace_id": "7GQeegpgTI-gqjHp", - "type": "metric", - "unit": "tokens", - "value": 131 - } - ] + "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:8106c\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:a03f3\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0719d\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", + "text": " magical", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -34214,7 +59240,7 @@ "data": { "event": { "delta": { - "text": "I", + "text": " concoction", "type": "text" }, "event_type": { @@ -34234,7 +59260,7 @@ "data": { "event": { "delta": { - "text": "'m ready to help. What's", + "text": " that allows", "type": "text" }, "event_type": { @@ -34254,7 +59280,7 @@ "data": { "event": { "delta": { - "text": " your first question about Torchtune", + "text": " the drink", "type": "text" }, "event_type": { @@ -34274,7 +59300,7 @@ "data": { "event": { "delta": { - "text": "?", + "text": "er to", "type": "text" }, "event_type": { @@ -34294,94 +59320,33 @@ "data": { "event": { "delta": { - "text": "", + "text": " assume the", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "mYTkxvK_", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:23.525734+00:00", - "__module__": "datetime" - }, - "trace_id": "kpcdkZQ2SsSOh9Lw", - "type": "metric", - "unit": "tokens", - "value": 75 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "mYTkxvK_", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:23.525763+00:00", - "__module__": "datetime" - }, - "trace_id": "kpcdkZQ2SsSOh9Lw", - "type": "metric", - "unit": "tokens", - "value": 26 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "mYTkxvK_", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:23.525770+00:00", - "__module__": "datetime" - }, - "trace_id": "kpcdkZQ2SsSOh9Lw", - "type": "metric", - "unit": "tokens", - "value": 101 - } - ] + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", + "text": " form", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -34395,7 +59360,7 @@ "data": { "event": { "delta": { - "text": "[k", + "text": " and appearance", "type": "text" }, "event_type": { @@ -34415,7 +59380,7 @@ "data": { "event": { "delta": { - "text": "nowledge_search(query=\"Tor", + "text": " of", "type": "text" }, "event_type": { @@ -34435,7 +59400,7 @@ "data": { "event": { "delta": { - "text": "chtune documentation\")]", + "text": " another person", "type": "text" }, "event_type": { @@ -34455,19 +59420,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "Torchtune documentation" - }, - "call_id": "385cbde8-19e8-4c8b-84ca-b75050b3666b", - "tool_name": "knowledge_search" - }, - "type": "tool_call" + "text": ", but", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -34475,11 +59429,7 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } @@ -34490,94 +59440,33 @@ "data": { "event": { "delta": { - "text": "", + "text": " it's", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "-7YS2sLl", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:30.668846+00:00", - "__module__": "datetime" - }, - "trace_id": "BLgI_VzNTCCRs_2T", - "type": "metric", - "unit": "tokens", - "value": 39 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "-7YS2sLl", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:30.668859+00:00", - "__module__": "datetime" - }, - "trace_id": "BLgI_VzNTCCRs_2T", - "type": "metric", - "unit": "tokens", - "value": 20 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "-7YS2sLl", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:30.668861+00:00", - "__module__": "datetime" - }, - "trace_id": "BLgI_VzNTCCRs_2T", - "type": "metric", - "unit": "tokens", - "value": 59 - } - ] + "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Llama3-8B attention type\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:num-1\\nContent: 3 `_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\\nof models across a `range of different benchmarks `_.\\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. In this tutorial we will focus on the 8B size model.\\nThere are a few main changes between Llama2-7B and Llama3-8B models:\\n\\n- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B\\n- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:num-1\\nContent: instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_\\n\\n|\\n\\nGetting access to Llama3-8B-Instruct\\n------------------------------------\\n\\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. First, let's download the model from Hugging Face. You will need to follow the instructions\\non the `official Meta page `_ to gain access to the model.\\nNext, make sure you grab your Hugging Face token from `here `_.\\n\\n\\n.. code-block:: bash\\n\\n tune download meta-llama/Meta-Llama-3\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:num-0\\nContent: :`download Llama3 Instruct weights `\\n\\n\\nTemplate changes from Llama2 to Llama3\\n--------------------------------------\\n\\nThe Llama2 chat model requires a specific template when prompting the pre-trained\\nmodel. Since the chat model was pretrained with this prompt template, if you want to run\\ninference on the model, you'll need to use the same template for optimal performance\\non chat data. Otherwise, the model will just perform standard text completion, which\\nmay or may not align with your intended use case.\\n\\nFrom the `official Llama2 prompt\\ntemplate guide `_\\nfor the Llama2 chat model, we can see that special tags are added:\\n\\n.. code-block:: text\\n\\n [INST] <>\\n You are a helpful, respectful, and honest assistant.\\n <>\\n\\n Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant \\n\\nLlama3 Instruct `overhauled `\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:num-0\\nContent: 'm Meta AI, your friendly AI assistant<|eot_id|>\\n\\nThe tags are entirely different, and they are actually encoded differently than in\\nLlama2. Let's walk through tokenizing an example with the Llama2 template and the\\nLlama3 template to understand how.\\n\\n.. note::\\n The Llama3 Base model uses a `different prompt template\\n `_ than Llama3 Instruct\\n because it has not yet been instruct tuned and the extra special tokens are untrained. If you\\n are running inference on the Llama3 Base model without fine-tuning we recommend the base\\n template for optimal performance. Generally, for instruct and chat data, we recommend using\\n Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\\n Llama3 Instruct.\\n\\n.. _prompt_template_vs_special_tokens:\\n\\nTokenizing prompt templates & special tokens\\n--------------------------------------------\\n\\nLet's say I have a sample of a single user-assistant turn accompanied with a system\\nprompt:\\n\\n.. code-block:: python\\n\\n sample = [\\n {\\n \\\"role\\\": \\\"system\\\",\\n \\\"\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:num-3\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\", and \\\"output_proj\\\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\\\"q_proj\\\", \\\"v_proj\\\"])\\n\\n.. note::\\n\\n Calling :func:`lora_llama_2\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Insert documents into memory\", \"parameters\": {}, \"tool_name\": \"insert_into_memory\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", + "text": " not a", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -34591,7 +59480,7 @@ "data": { "event": { "delta": { - "text": "L", + "text": " physical substance", "type": "text" }, "event_type": { @@ -34611,7 +59500,7 @@ "data": { "event": { "delta": { - "text": "lama3-8B uses grouped-query", + "text": " that", "type": "text" }, "event_type": { @@ -34631,7 +59520,7 @@ "data": { "event": { "delta": { - "text": " attention instead of", + "text": " can be", "type": "text" }, "event_type": { @@ -34651,7 +59540,7 @@ "data": { "event": { "delta": { - "text": " the standard multi-head attention.", + "text": " measured or", "type": "text" }, "event_type": { @@ -34671,94 +59560,33 @@ "data": { "event": { "delta": { - "text": "", + "text": " analyzed in", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "1eIEdjPP", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:18.982970+00:00", - "__module__": "datetime" - }, - "trace_id": "rNeuYcnxTSqrP6Dg", - "type": "metric", - "unit": "tokens", - "value": 80 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "1eIEdjPP", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:18.983000+00:00", - "__module__": "datetime" - }, - "trace_id": "rNeuYcnxTSqrP6Dg", - "type": "metric", - "unit": "tokens", - "value": 28 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "1eIEdjPP", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:18.983005+00:00", - "__module__": "datetime" - }, - "trace_id": "rNeuYcnxTSqrP6Dg", - "type": "metric", - "unit": "tokens", - "value": 108 - } - ] + "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Llama3-8B attention type\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:num-1\\nContent: 3 `_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\\nof models across a `range of different benchmarks `_.\\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. In this tutorial we will focus on the 8B size model.\\nThere are a few main changes between Llama2-7B and Llama3-8B models:\\n\\n- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B\\n- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:num-1\\nContent: instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_\\n\\n|\\n\\nGetting access to Llama3-8B-Instruct\\n------------------------------------\\n\\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. First, let's download the model from Hugging Face. You will need to follow the instructions\\non the `official Meta page `_ to gain access to the model.\\nNext, make sure you grab your Hugging Face token from `here `_.\\n\\n\\n.. code-block:: bash\\n\\n tune download meta-llama/Meta-Llama-3\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:num-0\\nContent: :`download Llama3 Instruct weights `\\n\\n\\nTemplate changes from Llama2 to Llama3\\n--------------------------------------\\n\\nThe Llama2 chat model requires a specific template when prompting the pre-trained\\nmodel. Since the chat model was pretrained with this prompt template, if you want to run\\ninference on the model, you'll need to use the same template for optimal performance\\non chat data. Otherwise, the model will just perform standard text completion, which\\nmay or may not align with your intended use case.\\n\\nFrom the `official Llama2 prompt\\ntemplate guide `_\\nfor the Llama2 chat model, we can see that special tags are added:\\n\\n.. code-block:: text\\n\\n [INST] <>\\n You are a helpful, respectful, and honest assistant.\\n <>\\n\\n Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant \\n\\nLlama3 Instruct `overhauled `\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:num-0\\nContent: 'm Meta AI, your friendly AI assistant<|eot_id|>\\n\\nThe tags are entirely different, and they are actually encoded differently than in\\nLlama2. Let's walk through tokenizing an example with the Llama2 template and the\\nLlama3 template to understand how.\\n\\n.. note::\\n The Llama3 Base model uses a `different prompt template\\n `_ than Llama3 Instruct\\n because it has not yet been instruct tuned and the extra special tokens are untrained. If you\\n are running inference on the Llama3 Base model without fine-tuning we recommend the base\\n template for optimal performance. Generally, for instruct and chat data, we recommend using\\n Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\\n Llama3 Instruct.\\n\\n.. _prompt_template_vs_special_tokens:\\n\\nTokenizing prompt templates & special tokens\\n--------------------------------------------\\n\\nLet's say I have a sample of a single user-assistant turn accompanied with a system\\nprompt:\\n\\n.. code-block:: python\\n\\n sample = [\\n {\\n \\\"role\\\": \\\"system\\\",\\n \\\"\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:num-3\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\", and \\\"output_proj\\\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\\\"q_proj\\\", \\\"v_proj\\\"])\\n\\n.. note::\\n\\n Calling :func:`lora_llama_2\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", + "text": " the same", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -34772,7 +59600,7 @@ "data": { "event": { "delta": { - "text": "L", + "text": " way as", "type": "text" }, "event_type": { @@ -34792,7 +59620,7 @@ "data": { "event": { "delta": { - "text": "lama3-8B uses grouped-query attention instead of", + "text": " real-world", "type": "text" }, "event_type": { @@ -34812,7 +59640,7 @@ "data": { "event": { "delta": { - "text": " the standard", + "text": " chemicals.\n\n", "type": "text" }, "event_type": { @@ -34832,7 +59660,7 @@ "data": { "event": { "delta": { - "text": " multi-head attention.", + "text": "If you", "type": "text" }, "event_type": { @@ -34852,94 +59680,33 @@ "data": { "event": { "delta": { - "text": "", + "text": " have any", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "SlTnlfYc", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:12.884663+00:00", - "__module__": "datetime" - }, - "trace_id": "liTx9auyTkyfvrBr", - "type": "metric", - "unit": "tokens", - "value": 80 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "SlTnlfYc", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:12.884753+00:00", - "__module__": "datetime" - }, - "trace_id": "liTx9auyTkyfvrBr", - "type": "metric", - "unit": "tokens", - "value": 28 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "SlTnlfYc", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:12.884760+00:00", - "__module__": "datetime" - }, - "trace_id": "liTx9auyTkyfvrBr", - "type": "metric", - "unit": "tokens", - "value": 108 - } - ] + "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Insert documents into memory\", \"parameters\": {}, \"tool_name\": \"insert_into_memory\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", + "text": " other questions", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -34953,7 +59720,7 @@ "data": { "event": { "delta": { - "text": "[k", + "text": " or if", "type": "text" }, "event_type": { @@ -34973,7 +59740,7 @@ "data": { "event": { "delta": { - "text": "nowledge_search(query=\"Llama3-8", + "text": " there", "type": "text" }, "event_type": { @@ -34993,7 +59760,7 @@ "data": { "event": { "delta": { - "text": "B attention type\")]", + "text": "'s anything", "type": "text" }, "event_type": { @@ -35013,19 +59780,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "Llama3-8B attention type" - }, - "call_id": "4901bbdf-8faf-4a57-b6f6-01688c6290e6", - "tool_name": "knowledge_search" - }, - "type": "tool_call" + "text": " else I", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -35033,11 +59789,7 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } @@ -35048,94 +59800,33 @@ "data": { "event": { "delta": { - "text": "", + "text": " can help", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "DBPomV08", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:15.412559+00:00", - "__module__": "datetime" - }, - "trace_id": "rNeuYcnxTSqrP6Dg", - "type": "metric", - "unit": "tokens", - "value": 40 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "DBPomV08", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:15.412607+00:00", - "__module__": "datetime" - }, - "trace_id": "rNeuYcnxTSqrP6Dg", - "type": "metric", - "unit": "tokens", - "value": 24 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "DBPomV08", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:15.412615+00:00", - "__module__": "datetime" - }, - "trace_id": "rNeuYcnxTSqrP6Dg", - "type": "metric", - "unit": "tokens", - "value": 64 - } - ] + "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", + "text": " you", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -35149,7 +59840,7 @@ "data": { "event": { "delta": { - "text": "[k", + "text": " with,", "type": "text" }, "event_type": { @@ -35169,7 +59860,7 @@ "data": { "event": { "delta": { - "text": "nowledge_search(query=\"Llama3-8B attention", + "text": " feel free", "type": "text" }, "event_type": { @@ -35189,7 +59880,7 @@ "data": { "event": { "delta": { - "text": " type\")]", + "text": " to ask", "type": "text" }, "event_type": { @@ -35209,19 +59900,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "Llama3-8B attention type" - }, - "call_id": "dd056386-b105-47e5-bd85-07e5ae096de1", - "tool_name": "knowledge_search" - }, - "type": "tool_call" + "text": "!", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -35229,11 +59909,7 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } @@ -35261,55 +59937,19 @@ }, "metrics": [ { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "prompt_tokens", - "span_id": "yjKrmpeo", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:12.041566+00:00", - "__module__": "datetime" - }, - "trace_id": "liTx9auyTkyfvrBr", - "type": "metric", - "unit": "tokens", - "value": 40 + "unit": null, + "value": 30 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "completion_tokens", - "span_id": "yjKrmpeo", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:12.041591+00:00", - "__module__": "datetime" - }, - "trace_id": "liTx9auyTkyfvrBr", - "type": "metric", - "unit": "tokens", - "value": 24 + "unit": null, + "value": 113 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "total_tokens", - "span_id": "yjKrmpeo", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:12.041597+00:00", - "__module__": "datetime" - }, - "trace_id": "liTx9auyTkyfvrBr", - "type": "metric", - "unit": "tokens", - "value": 64 + "unit": null, + "value": 143 } ] } @@ -35317,7 +59957,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the current CEO of Meta is.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"current CEO of Meta\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"{\\\"query\\\": \\\"current CEO of Meta\\\", \\\"top_k\\\": [{\\\"title\\\": \\\"Meta - Leadership & Governance\\\", \\\"url\\\": \\\"https://investor.atmeta.com/leadership-and-governance/\\\", \\\"content\\\": \\\"Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. Mark is responsible for setting the overall direction and product strategy for the company. He leads the design of Meta's services and development of its core technology and infrastructure. Mark studied computer science at Harvard\\\", \\\"score\\\": 0.8342047, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Executives - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer Joel Kaplan, Chief Global Affairs Officer Susan Li, Chief Financial Officer Javier Olivan, Chief Operating Officer Chris Cox, Chief Product Officer Andrew \\\\u2018Boz\\\\u2019 Bosworth, Chief Technology Officer Jennifer Newstead, Chief Legal Officer Dave Wehner, Chief Strategy Officer Will Cathcart, Head of WhatsApp Naomi Gleit, Head of Product John Hegeman, Chief Revenue Officer Adam Mosseri, Head of Instagram Erin Egan, Chief Privacy Officer, Policy Michel Protti, Chief Privacy Officer, Product Alex Schultz, Chief Marketing Officer and VP of Analytics Tom Alison, Head of Facebook Nicola Mendelsohn, Head of Global Business Group Ahmad Al-Dahle, VP and Head of GenAI at Meta Joelle Pineau, Vice President of AI Research and Head of FAIR at Meta\\\", \\\"score\\\": 0.8190992, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\\\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\\\u00a9 2025 Meta\\\", \\\"score\\\": 0.79099923, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Meet the Executive CSuite Team of Meta (Facebook) [2025]\\\", \\\"url\\\": \\\"https://digitaldefynd.com/IQ/meet-the-executive-csuite-team-of-meta-facebook/\\\", \\\"content\\\": \\\"Harvard University Executive Programs Free Harvard University Courses As a chief financial officer of Meta, Susan Li oversees the firm\\\\u2019s finance and facilities team to keep track of the company\\\\u2019s overall financial health. The chief operating officer of Meta, Javier Olivan, oversees the firm\\\\u2019s business team, infrastructure, and other products. Andrew Bosworth, called Boz, serves as chief technology officer at Meta and is responsible for leading the firm\\\\u2019s AR/VR organization, Reality Labs. Andrew has also served as engineering director to oversee events, mobile monetization, and feed ads and as VP of ads and business platforms to lead engineering, design, analytics, and product teams. Meta\\\\u2019s c-suite team comprises experienced and diverse executives, having extensive experience in technology, finance, legal, and all major industries.\\\", \\\"score\\\": 0.7602419, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg - Wikipedia\\\", \\\"url\\\": \\\"https://en.wikipedia.org/wiki/Mark_Zuckerberg\\\", \\\"content\\\": \\\"They began dating in 2003.[175] In September 2010, Chan, who was a medical student at the University of California, San Francisco at the time,[176] moved into his rented house in Palo Alto, California.[177][178] They married on May 19, 2012, in the grounds of his mansion in an event that also celebrated her graduation from medical school.[179][180] Zuckerberg revealed in July 2015 that they were expecting a baby girl and that Chan had previously experienced three miscarriages.[181] Their first daughter was born in December 2015.[182] They announced in a Chinese New Year video that their daughter's Chinese name is Chen Mingyu (Chinese: \\\\u9648\\\\u660e\\\\u5b87).[183] Their second daughter was born in August 2017.[184] Zuckerberg and his wife welcomed their third daughter in March 2023 and announced the news across his social media pages.[185] The couple also have a Puli dog named Beast,[186] who has over two million followers on Facebook.[187] Zuckerberg commissioned the visual artist Daniel Arsham to build a 7-foot-tall sculpture of his wife, which was unveiled in 2024.[188]\\\", \\\"score\\\": 0.05564338, \\\"raw_content\\\": null}]}\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"required\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -35345,7 +59985,7 @@ "data": { "event": { "delta": { - "text": "The", + "text": "[", "type": "text" }, "event_type": { @@ -35365,7 +60005,7 @@ "data": { "event": { "delta": { - "text": " current CEO of Meta is Mark Zuckerberg.", + "text": "get_bo", "type": "text" }, "event_type": { @@ -35385,94 +60025,53 @@ "data": { "event": { "delta": { - "text": "", + "text": "iling_point(liquid_name='poly", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "oB7hDf6E", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:07.084924+00:00", - "__module__": "datetime" - }, - "trace_id": "hwA8OLUhQ1qa3ecF", - "type": "metric", - "unit": "tokens", - "value": 1145 + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "ju", + "type": "text" }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "oB7hDf6E", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:07.084934+00:00", - "__module__": "datetime" - }, - "trace_id": "hwA8OLUhQ1qa3ecF", - "type": "metric", - "unit": "tokens", - "value": 19 + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "oB7hDf6E", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:07.084936+00:00", - "__module__": "datetime" - }, - "trace_id": "hwA8OLUhQ1qa3ecF", - "type": "metric", - "unit": "tokens", - "value": 1164 - } - ] + "logprobs": null, + "stop_reason": null + }, + "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the current CEO of Meta is.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", + "text": "ice', cel", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -35486,13 +60085,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" + "text": "cius", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -35511,13 +60105,28 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "brave_search.call(query=\"current CEO of Meta\")", - "type": "tool_call" + "text": "=True", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ")]", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -35543,14 +60152,11 @@ }, "tool_call": { "arguments": { - "query": "current CEO of Meta" + "celcius": true, + "liquid_name": "polyjuice" }, - "call_id": "535c272b-768b-44fe-b303-2eae022f67f5", - "tool_name": { - "__enum__": "BuiltinTool", - "__module__": "llama_stack.models.llama.datatypes", - "value": "brave_search" - } + "call_id": "81e4629c-9ed7-4fda-b0fd-8db41cd00407", + "tool_name": "get_boiling_point" }, "type": "tool_call" }, @@ -35592,55 +60198,19 @@ }, "metrics": [ { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "prompt_tokens", - "span_id": "AZ60Ocso", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:03.907918+00:00", - "__module__": "datetime" - }, - "trace_id": "hwA8OLUhQ1qa3ecF", - "type": "metric", - "unit": "tokens", - "value": 34 + "unit": null, + "value": 30 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "completion_tokens", - "span_id": "AZ60Ocso", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:03.907933+00:00", - "__module__": "datetime" - }, - "trace_id": "hwA8OLUhQ1qa3ecF", - "type": "metric", - "unit": "tokens", - "value": 10 + "unit": null, + "value": 28 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "total_tokens", - "span_id": "AZ60Ocso", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:03.907936+00:00", - "__module__": "datetime" - }, - "trace_id": "hwA8OLUhQ1qa3ecF", - "type": "metric", - "unit": "tokens", - "value": 44 + "unit": null, + "value": 58 } ] } @@ -35648,7 +60218,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": \"get_boiling_point\", \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Write code and execute it to find the answer for: What is the 100th prime number?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"def is_prime(n):\\n if n <= 1:\\n return False\\n if n <= 3:\\n return True\\n if n % 2 == 0 or n % 3 == 0:\\n return False\\n i = 5\\n while i * i <= n:\\n if n % i == 0 or n % (i + 2) == 0:\\n return False\\n i += 6\\n return True\\n\\ndef nth_prime(n):\\n count = 0\\n num = 2\\n while True:\\n if is_prime(num):\\n count += 1\\n if count == n:\\n return num\\n num += 1\\n\\nprint(nth_prime(100))\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -35696,7 +60266,7 @@ "data": { "event": { "delta": { - "text": " boiling point of polyjuice is -100 degrees Celsius", + "text": " 100th prime number is 541", "type": "text" }, "event_type": { @@ -35758,16 +60328,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "drZjZkfj", + "span_id": "bxIams_G", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T02:04:33.852666+00:00", + "__datetime__": "2025-03-07T01:44:13.404182+00:00", "__module__": "datetime" }, - "trace_id": "Sn0I7GFHTxKxewK2", + "trace_id": "snO106yxStaL10ow", "type": "metric", "unit": "tokens", - "value": 77 + "value": 252 }, { "attributes": { @@ -35775,16 +60345,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "drZjZkfj", + "span_id": "bxIams_G", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T02:04:33.852692+00:00", + "__datetime__": "2025-03-07T01:44:13.404224+00:00", "__module__": "datetime" }, - "trace_id": "Sn0I7GFHTxKxewK2", + "trace_id": "snO106yxStaL10ow", "type": "metric", "unit": "tokens", - "value": 23 + "value": 20 }, { "attributes": { @@ -35792,16 +60362,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "drZjZkfj", + "span_id": "bxIams_G", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T02:04:33.852699+00:00", + "__datetime__": "2025-03-07T01:44:13.404230+00:00", "__module__": "datetime" }, - "trace_id": "Sn0I7GFHTxKxewK2", + "trace_id": "snO106yxStaL10ow", "type": "metric", "unit": "tokens", - "value": 100 + "value": 272 } ] } @@ -35809,7 +60379,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Write code and execute it to find the answer for: What is the 100th prime number?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"def is_prime(n):\\n if n <= 1:\\n return False\\n if n == 2:\\n return True\\n if n % 2 == 0:\\n return False\\n max_divisor = int(n**0.5) + 1\\n for d in range(3, max_divisor, 2):\\n if n % d == 0:\\n return False\\n return True\\n\\ndef nth_prime(n):\\n count = 0\\n num = 2\\n while True:\\n if is_prime(num):\\n count += 1\\n if count == n:\\n return num\\n num += 1\\n\\nprint(nth_prime(100))\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -35857,7 +60427,87 @@ "data": { "event": { "delta": { - "text": " boiling point of polyjuice is -100 degrees Celsius.", + "text": " 100", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "th prime", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " number is", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " 541", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ".", "type": "text" }, "event_type": { @@ -35894,55 +60544,19 @@ }, "metrics": [ { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "prompt_tokens", - "span_id": "WMEZtUXH", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:00:32.617998+00:00", - "__module__": "datetime" - }, - "trace_id": "f9RM1qaUTk2LvaVo", - "type": "metric", - "unit": "tokens", - "value": 77 + "unit": null, + "value": 243 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "completion_tokens", - "span_id": "WMEZtUXH", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:00:32.618030+00:00", - "__module__": "datetime" - }, - "trace_id": "f9RM1qaUTk2LvaVo", - "type": "metric", - "unit": "tokens", - "value": 23 + "unit": null, + "value": 20 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "total_tokens", - "span_id": "WMEZtUXH", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:00:32.618036+00:00", - "__module__": "datetime" - }, - "trace_id": "f9RM1qaUTk2LvaVo", - "type": "metric", - "unit": "tokens", - "value": 100 + "unit": null, + "value": 263 } ] } @@ -35950,7 +60564,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"required\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Write code and execute it to find the answer for: What is the 100th prime number?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -35978,8 +60592,13 @@ "data": { "event": { "delta": { - "text": "The", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -35998,8 +60617,88 @@ "data": { "event": { "delta": { - "text": " function get_boiling_point is not", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "def is", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "_prime(n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "):\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " ", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -36018,8 +60717,13 @@ "data": { "event": { "delta": { - "text": " able", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " if n", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -36038,8 +60742,13 @@ "data": { "event": { "delta": { - "text": " to find the", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " <= ", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -36058,8 +60767,13 @@ "data": { "event": { "delta": { - "text": " boiling point of \"polyjuice\" as", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "1", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -36078,8 +60792,13 @@ "data": { "event": { "delta": { - "text": " it", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ":\n ", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -36098,8 +60817,13 @@ "data": { "event": { "delta": { - "text": " is not a real liquid", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " return False", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -36118,8 +60842,13 @@ "data": { "event": { "delta": { - "text": ". Polyju", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\n ", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -36138,8 +60867,13 @@ "data": { "event": { "delta": { - "text": "ice is a fictional substance from the", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " if n", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -36158,8 +60892,13 @@ "data": { "event": { "delta": { - "text": " Harry Potter series.", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " == ", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -36178,94 +60917,43 @@ "data": { "event": { "delta": { - "text": "", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "2:\n", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "p7Vx9VAq", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:04:28.232189+00:00", - "__module__": "datetime" - }, - "trace_id": "WKEqFugATCeCl8mc", - "type": "metric", - "unit": "tokens", - "value": 77 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "p7Vx9VAq", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:04:28.232325+00:00", - "__module__": "datetime" - }, - "trace_id": "WKEqFugATCeCl8mc", - "type": "metric", - "unit": "tokens", - "value": 51 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "p7Vx9VAq", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:04:28.232334+00:00", - "__module__": "datetime" - }, - "trace_id": "WKEqFugATCeCl8mc", - "type": "metric", - "unit": "tokens", - "value": 128 - } - ] + "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": \"get_boiling_point\", \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " ", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -36279,8 +60967,13 @@ "data": { "event": { "delta": { - "text": "The", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " return True", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -36299,8 +60992,13 @@ "data": { "event": { "delta": { - "text": " function call should be", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\n ", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -36319,8 +61017,13 @@ "data": { "event": { "delta": { - "text": ":\n[", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " if n", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -36339,8 +61042,13 @@ "data": { "event": { "delta": { - "text": "get", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " % ", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -36359,8 +61067,13 @@ "data": { "event": { "delta": { - "text": "_boiling_point(liquid_name='polyjuice', celci", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "2 ==", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -36379,8 +61092,13 @@ "data": { "event": { "delta": { - "text": "us=True)]", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " 0", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -36399,94 +61117,43 @@ "data": { "event": { "delta": { - "text": "", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ":\n ", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "JN7UZs_c", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:42.473221+00:00", - "__module__": "datetime" - }, - "trace_id": "H3r-_Zh-TVqtSp7k", - "type": "metric", - "unit": "tokens", - "value": 86 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "JN7UZs_c", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:42.473254+00:00", - "__module__": "datetime" - }, - "trace_id": "H3r-_Zh-TVqtSp7k", - "type": "metric", - "unit": "tokens", - "value": 34 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "JN7UZs_c", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:42.473261+00:00", - "__module__": "datetime" - }, - "trace_id": "H3r-_Zh-TVqtSp7k", - "type": "metric", - "unit": "tokens", - "value": 120 - } - ] + "stop_reason": null + }, + "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " return", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -36500,8 +61167,13 @@ "data": { "event": { "delta": { - "text": "The", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " False", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -36520,8 +61192,13 @@ "data": { "event": { "delta": { - "text": " function `get_boiling_point`", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\n ", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -36540,8 +61217,13 @@ "data": { "event": { "delta": { - "text": " is not a real function and cannot be", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " max_div", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -36560,8 +61242,13 @@ "data": { "event": { "delta": { - "text": " used to determine the boiling point of polyju", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "isor", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -36580,8 +61267,13 @@ "data": { "event": { "delta": { - "text": "ice. Polyjuice is a fictional substance from the", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " =", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -36600,8 +61292,13 @@ "data": { "event": { "delta": { - "text": " Harry Potter series and does not have a real-world boiling", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " int(n", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -36620,8 +61317,13 @@ "data": { "event": { "delta": { - "text": " point. If you have any other questions or need help", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "**0", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -36640,8 +61342,13 @@ "data": { "event": { "delta": { - "text": " with a different topic, feel free to ask!", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -36660,94 +61367,68 @@ "data": { "event": { "delta": { - "text": "", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "5)", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "aCPTIc0d", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:53:27.227208+00:00", - "__module__": "datetime" + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" }, - "trace_id": "4DRyVE86RpCeqfpE", - "type": "metric", - "unit": "tokens", - "value": 86 + "tool_call": " + ", + "type": "tool_call" }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "aCPTIc0d", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:53:27.227251+00:00", - "__module__": "datetime" - }, - "trace_id": "4DRyVE86RpCeqfpE", - "type": "metric", - "unit": "tokens", - "value": 78 + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "aCPTIc0d", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:53:27.227258+00:00", - "__module__": "datetime" - }, - "trace_id": "4DRyVE86RpCeqfpE", - "type": "metric", - "unit": "tokens", - "value": 164 - } - ] + "logprobs": null, + "stop_reason": null + }, + "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"required\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "1\n", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -36761,8 +61442,13 @@ "data": { "event": { "delta": { - "text": "The", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " for", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -36781,8 +61467,13 @@ "data": { "event": { "delta": { - "text": " function call should be in the following format", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " d in", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -36801,8 +61492,13 @@ "data": { "event": { "delta": { - "text": ": [function_name(parameters)]. However", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " range(", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -36821,8 +61517,13 @@ "data": { "event": { "delta": { - "text": ", the function get_boiling_point is not recognized", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "3,", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -36841,8 +61542,13 @@ "data": { "event": { "delta": { - "text": ". If the function", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " max_div", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -36861,8 +61567,13 @@ "data": { "event": { "delta": { - "text": " is supposed to return the boiling point of a liquid, it should be defined", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "isor,", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -36881,8 +61592,13 @@ "data": { "event": { "delta": { - "text": " before it can be used. \n\nIn this", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " 2", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -36901,8 +61617,13 @@ "data": { "event": { "delta": { - "text": " case, I will assume that the function get_boiling_point is defined as", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "):\n ", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -36921,8 +61642,13 @@ "data": { "event": { "delta": { - "text": " follows:\ndef get", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " if n", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -36941,8 +61667,13 @@ "data": { "event": { "delta": { - "text": "_boiling_point(liquid_name, celcius=True):\n # This", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " % d", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -36961,8 +61692,13 @@ "data": { "event": { "delta": { - "text": " function returns the", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " == ", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -36981,8 +61717,13 @@ "data": { "event": { "delta": { - "text": " boiling point of a liquid in Celcius or Fahrenheit\n boiling_points", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "0:\n", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -37001,8 +61742,13 @@ "data": { "event": { "delta": { - "text": " = {\n \"water\": 100,\n \"polyjuice\":", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " return", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -37021,8 +61767,13 @@ "data": { "event": { "delta": { - "text": " 120 # Assuming poly", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " False", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -37041,8 +61792,13 @@ "data": { "event": { "delta": { - "text": "juice has a boiling point of 120 degrees Cel", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\n", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -37061,8 +61817,13 @@ "data": { "event": { "delta": { - "text": "cius\n }\n if liquid", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " return", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -37081,8 +61842,13 @@ "data": { "event": { "delta": { - "text": "_name in boiling_points:\n if celcius:\n return", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " True\n\n", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -37101,8 +61867,13 @@ "data": { "event": { "delta": { - "text": " boiling_points[liquid_name]\n else:\n return boiling_points[liquid", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "def nth", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -37121,8 +61892,13 @@ "data": { "event": { "delta": { - "text": "_name] * 9/5 + ", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "_prime(n", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -37141,8 +61917,13 @@ "data": { "event": { "delta": { - "text": "32\n else:\n return \"Boiling point not found", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "):\n ", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -37161,8 +61942,13 @@ "data": { "event": { "delta": { - "text": "\"\n\nNow, the function call", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " count", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -37181,8 +61967,13 @@ "data": { "event": { "delta": { - "text": " should be: \n", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " =", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -37201,8 +61992,13 @@ "data": { "event": { "delta": { - "text": "[get_boiling_point(liquid_name=\"polyju", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " 0", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -37221,8 +62017,13 @@ "data": { "event": { "delta": { - "text": "ice\", celcius=True)]", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\n ", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -37241,94 +62042,43 @@ "data": { "event": { "delta": { - "text": "", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " num =", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "NnkGeCwM", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:35.213901+00:00", - "__module__": "datetime" - }, - "trace_id": "7ifSRjCjRIioDOte", - "type": "metric", - "unit": "tokens", - "value": 86 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "NnkGeCwM", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:35.213925+00:00", - "__module__": "datetime" - }, - "trace_id": "7ifSRjCjRIioDOte", - "type": "metric", - "unit": "tokens", - "value": 234 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "NnkGeCwM", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:35.213931+00:00", - "__module__": "datetime" - }, - "trace_id": "7ifSRjCjRIioDOte", - "type": "metric", - "unit": "tokens", - "value": 320 - } - ] + "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": \"get_boiling_point\", \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " 2", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -37342,8 +62092,13 @@ "data": { "event": { "delta": { - "text": "[", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\n ", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -37362,8 +62117,13 @@ "data": { "event": { "delta": { - "text": "get_boiling_point(liquid_name='polyjuice", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " while True", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -37382,8 +62142,13 @@ "data": { "event": { "delta": { - "text": "', celcius=True)]", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ":\n", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -37405,16 +62170,9 @@ "parse_status": { "__enum__": "ToolCallParseStatus", "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "celcius": true, - "liquid_name": "polyjuice" - }, - "call_id": "d43b2636-903d-430d-8389-91eefe5a1d75", - "tool_name": "get_boiling_point" + "value": "in_progress" }, + "tool_call": " ", "type": "tool_call" }, "event_type": { @@ -37423,11 +62181,7 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } @@ -37438,94 +62192,43 @@ "data": { "event": { "delta": { - "text": "", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " if is", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "9EBiVeAT", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:04:32.221646+00:00", - "__module__": "datetime" - }, - "trace_id": "7kB12OwpSUOcwmJV", - "type": "metric", - "unit": "tokens", - "value": 30 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "9EBiVeAT", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:04:32.221673+00:00", - "__module__": "datetime" - }, - "trace_id": "7kB12OwpSUOcwmJV", - "type": "metric", - "unit": "tokens", - "value": 28 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "9EBiVeAT", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:04:32.221680+00:00", - "__module__": "datetime" - }, - "trace_id": "7kB12OwpSUOcwmJV", - "type": "metric", - "unit": "tokens", - "value": 58 - } - ] + "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "_prime(num", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -37539,8 +62242,13 @@ "data": { "event": { "delta": { - "text": "[", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "):\n ", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -37559,8 +62267,13 @@ "data": { "event": { "delta": { - "text": "get_boiling_point(liquid_name", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " count +=", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -37579,8 +62292,13 @@ "data": { "event": { "delta": { - "text": "='polyjuice', celcius=True)]", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " 1", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -37602,16 +62320,9 @@ "parse_status": { "__enum__": "ToolCallParseStatus", "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "celcius": true, - "liquid_name": "polyjuice" - }, - "call_id": "0548b2ef-daa4-4099-bb2c-b34f00752339", - "tool_name": "get_boiling_point" + "value": "in_progress" }, + "tool_call": "\n ", "type": "tool_call" }, "event_type": { @@ -37620,11 +62331,7 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } @@ -37635,94 +62342,43 @@ "data": { "event": { "delta": { - "text": "", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " if count", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "lc3YWIQH", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:00:31.366139+00:00", - "__module__": "datetime" - }, - "trace_id": "zDQV0rn3TNKfByA0", - "type": "metric", - "unit": "tokens", - "value": 30 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "lc3YWIQH", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:00:31.366166+00:00", - "__module__": "datetime" - }, - "trace_id": "zDQV0rn3TNKfByA0", - "type": "metric", - "unit": "tokens", - "value": 28 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "lc3YWIQH", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:00:31.366172+00:00", - "__module__": "datetime" - }, - "trace_id": "zDQV0rn3TNKfByA0", - "type": "metric", - "unit": "tokens", - "value": 58 - } - ] + "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"none\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " == n", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -37736,8 +62392,13 @@ "data": { "event": { "delta": { - "text": "Poly", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ":\n ", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -37756,8 +62417,13 @@ "data": { "event": { "delta": { - "text": "juice is a fictional potion from", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " return num", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -37776,8 +62442,13 @@ "data": { "event": { "delta": { - "text": " the Harry Potter series by J.K. Rowling. As it", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\n ", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -37796,8 +62467,13 @@ "data": { "event": { "delta": { - "text": "'s not a real substance, it doesn't have a boiling point", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " num +=", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -37816,8 +62492,13 @@ "data": { "event": { "delta": { - "text": ". Polyjuice Potion is a magical concoction", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " 1", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -37836,8 +62517,13 @@ "data": { "event": { "delta": { - "text": " that allows the drinker to assume the form and", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\n\nprint", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -37856,8 +62542,13 @@ "data": { "event": { "delta": { - "text": " appearance", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "(nth", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -37876,8 +62567,13 @@ "data": { "event": { "delta": { - "text": " of another person, but it's not a physical substance that can", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "_prime(", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -37896,8 +62592,13 @@ "data": { "event": { "delta": { - "text": " be measured or analyzed in the same way as real-world", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "100))", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -37916,8 +62617,23 @@ "data": { "event": { "delta": { - "text": " chemicals.\n\nIf you", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "def is_prime(n):\n if n <= 1:\n return False\n if n == 2:\n return True\n if n % 2 == 0:\n return False\n max_divisor = int(n**0.5) + 1\n for d in range(3, max_divisor, 2):\n if n % d == 0:\n return False\n return True\n\ndef nth_prime(n):\n count = 0\n num = 2\n while True:\n if is_prime(num):\n count += 1\n if count == n:\n return num\n num += 1\n\nprint(nth_prime(100))" + }, + "call_id": "67e0a41c-4428-47a9-b276-df436c014992", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -37925,7 +62641,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -37936,33 +62656,58 @@ "data": { "event": { "delta": { - "text": " have any other questions or", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 40 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 50 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"when was Perplexity the company founded?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Perplexity the company founding date\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " if there's anything else I can help you with, feel free to ask", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -37976,7 +62721,7 @@ "data": { "event": { "delta": { - "text": "!", + "text": "Per", "type": "text" }, "event_type": { @@ -37996,94 +62741,33 @@ "data": { "event": { "delta": { - "text": "", + "text": "plexity", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "M0oC9v8Y", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:04:30.531648+00:00", - "__module__": "datetime" - }, - "trace_id": "0CMlh2kQShSVm3zE", - "type": "metric", - "unit": "tokens", - "value": 30 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "M0oC9v8Y", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:04:30.531666+00:00", - "__module__": "datetime" - }, - "trace_id": "0CMlh2kQShSVm3zE", - "type": "metric", - "unit": "tokens", - "value": 113 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "M0oC9v8Y", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:04:30.531671+00:00", - "__module__": "datetime" - }, - "trace_id": "0CMlh2kQShSVm3zE", - "type": "metric", - "unit": "tokens", - "value": 143 - } - ] + "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"required\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", + "text": " the company", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -38097,7 +62781,7 @@ "data": { "event": { "delta": { - "text": "[", + "text": " was founded", "type": "text" }, "event_type": { @@ -38117,7 +62801,7 @@ "data": { "event": { "delta": { - "text": "get_boiling_point(liquid_name='polyjuice', cel", + "text": " in ", "type": "text" }, "event_type": { @@ -38137,7 +62821,7 @@ "data": { "event": { "delta": { - "text": "cius=True)]", + "text": "2022", "type": "text" }, "event_type": { @@ -38157,20 +62841,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "celcius": true, - "liquid_name": "polyjuice" - }, - "call_id": "acbb04a1-08f4-4277-9b66-aadda2fa2be7", - "tool_name": "get_boiling_point" - }, - "type": "tool_call" + "text": ".", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -38178,11 +62850,7 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } @@ -38210,55 +62878,19 @@ }, "metrics": [ { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "prompt_tokens", - "span_id": "jMXDDKvp", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:04:26.175063+00:00", - "__module__": "datetime" - }, - "trace_id": "44TwzIrGS2aqfbVn", - "type": "metric", - "unit": "tokens", - "value": 30 + "unit": null, + "value": 68 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "completion_tokens", - "span_id": "jMXDDKvp", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:04:26.175128+00:00", - "__module__": "datetime" - }, - "trace_id": "44TwzIrGS2aqfbVn", - "type": "metric", - "unit": "tokens", - "value": 28 + "unit": null, + "value": 22 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "total_tokens", - "span_id": "jMXDDKvp", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:04:26.175137+00:00", - "__module__": "datetime" - }, - "trace_id": "44TwzIrGS2aqfbVn", - "type": "metric", - "unit": "tokens", - "value": 58 + "unit": null, + "value": 90 } ] } @@ -38266,7 +62898,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Write code and execute it to find the answer for: What is the 100th prime number?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"def is_prime(n):\\n if n <= 1:\\n return False\\n if n <= 3:\\n return True\\n if n % 2 == 0 or n % 3 == 0:\\n return False\\n i = 5\\n while i * i <= n:\\n if n % i == 0 or n % (i + 2) == 0:\\n return False\\n i += 6\\n return True\\n\\ndef nth_prime(n):\\n count = 0\\n num = 2\\n while True:\\n if is_prime(num):\\n count += 1\\n if count == n:\\n return num\\n num += 1\\n\\nprint(nth_prime(100))\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"when was Perplexity the company founded?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -38294,7 +62926,7 @@ "data": { "event": { "delta": { - "text": "The", + "text": "[k", "type": "text" }, "event_type": { @@ -38314,7 +62946,7 @@ "data": { "event": { "delta": { - "text": " 100th prime number is 541", + "text": "nowledge_search", "type": "text" }, "event_type": { @@ -38334,7 +62966,7 @@ "data": { "event": { "delta": { - "text": ".", + "text": "(query=\"", "type": "text" }, "event_type": { @@ -38345,103 +62977,22 @@ "logprobs": null, "stop_reason": null }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "bxIams_G", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:13.404182+00:00", - "__module__": "datetime" - }, - "trace_id": "snO106yxStaL10ow", - "type": "metric", - "unit": "tokens", - "value": 252 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "bxIams_G", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:13.404224+00:00", - "__module__": "datetime" - }, - "trace_id": "snO106yxStaL10ow", - "type": "metric", - "unit": "tokens", - "value": 20 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "bxIams_G", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:13.404230+00:00", - "__module__": "datetime" - }, - "trace_id": "snO106yxStaL10ow", - "type": "metric", - "unit": "tokens", - "value": 272 - } - ] + "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Write code and execute it to find the answer for: What is the 100th prime number?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", + "text": "Perplex", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -38455,13 +63006,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" + "text": "ity the", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -38480,13 +63026,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "def is_prime(n):\n if n <= 1:\n return False", - "type": "tool_call" + "text": " company founding", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -38505,13 +63046,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "\n if n <= 3:\n return True", - "type": "tool_call" + "text": " date\")]", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -38533,9 +63069,15 @@ "parse_status": { "__enum__": "ToolCallParseStatus", "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "Perplexity the company founding date" + }, + "call_id": "1d69f6cc-160a-47d8-a452-4deb322954fd", + "tool_name": "knowledge_search" }, - "tool_call": "\n if n % 2 == 0 or n % 3", "type": "tool_call" }, "event_type": { @@ -38544,7 +63086,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -38555,18 +63101,58 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " == 0:\n return False\n i = 5\n ", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 29 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 23 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 52 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"when was the nba created?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"NBA creation date\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" }, "logprobs": null, "stop_reason": null @@ -38580,13 +63166,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " while i * i <= n:\n if n", - "type": "tool_call" + "text": "The", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -38605,13 +63186,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " % i == 0 or n % (i", - "type": "tool_call" + "text": " NBA was", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -38630,13 +63206,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " + 2) == 0:\n return False\n i +=", - "type": "tool_call" + "text": " created on", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -38655,13 +63226,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " 6\n return True\n\ndef nth_prime(n):\n count =", - "type": "tool_call" + "text": " August ", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -38680,13 +63246,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " 0\n num = 2\n while True:\n if", - "type": "tool_call" + "text": "3", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -38705,13 +63266,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " is_prime(num):\n count += 1\n if count == n", - "type": "tool_call" + "text": ", ", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -38730,13 +63286,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": ":\n return num\n num += 1\n\nprint(nth_prime", - "type": "tool_call" + "text": "1949", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -38755,13 +63306,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "(100))", - "type": "tool_call" + "text": ", with", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -38780,23 +63326,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "code": "def is_prime(n):\n if n <= 1:\n return False\n if n <= 3:\n return True\n if n % 2 == 0 or n % 3 == 0:\n return False\n i = 5\n while i * i <= n:\n if n % i == 0 or n % (i + 2) == 0:\n return False\n i += 6\n return True\n\ndef nth_prime(n):\n count = 0\n num = 2\n while True:\n if is_prime(num):\n count += 1\n if count == n:\n return num\n num += 1\n\nprint(nth_prime(100))" - }, - "call_id": "e1110bc1-dc83-480d-ad33-09d49f5ccc8d", - "tool_name": { - "__enum__": "BuiltinTool", - "__module__": "llama_stack.models.llama.datatypes", - "value": "code_interpreter" - } - }, - "type": "tool_call" + "text": " the merger", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -38804,11 +63335,7 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } @@ -38819,94 +63346,53 @@ "data": { "event": { "delta": { - "text": "", + "text": " of", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "5J3hM-La", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:09.121100+00:00", - "__module__": "datetime" - }, - "trace_id": "snO106yxStaL10ow", - "type": "metric", - "unit": "tokens", - "value": 40 + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the", + "type": "text" }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "5J3hM-La", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:09.121127+00:00", - "__module__": "datetime" - }, - "trace_id": "snO106yxStaL10ow", - "type": "metric", - "unit": "tokens", - "value": 10 + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "5J3hM-La", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:09.121132+00:00", - "__module__": "datetime" - }, - "trace_id": "snO106yxStaL10ow", - "type": "metric", - "unit": "tokens", - "value": 50 - } - ] + "logprobs": null, + "stop_reason": null + }, + "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"when was Perplexity the company founded?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Perplexity the company founding date\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", + "text": " Basketball Association", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -38920,7 +63406,7 @@ "data": { "event": { "delta": { - "text": "Per", + "text": " of America", "type": "text" }, "event_type": { @@ -38940,7 +63426,7 @@ "data": { "event": { "delta": { - "text": "plexity the company was founded in 2022.", + "text": " (B", "type": "text" }, "event_type": { @@ -38960,94 +63446,33 @@ "data": { "event": { "delta": { - "text": "", + "text": "AA)", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "6jxCq3gU", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:50.430436+00:00", - "__module__": "datetime" - }, - "trace_id": "XhZWljYTTDCYF7vI", - "type": "metric", - "unit": "tokens", - "value": 68 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "6jxCq3gU", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:50.430477+00:00", - "__module__": "datetime" - }, - "trace_id": "XhZWljYTTDCYF7vI", - "type": "metric", - "unit": "tokens", - "value": 22 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "6jxCq3gU", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:50.430489+00:00", - "__module__": "datetime" - }, - "trace_id": "XhZWljYTTDCYF7vI", - "type": "metric", - "unit": "tokens", - "value": 90 - } - ] + "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"when was Perplexity the company founded?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", + "text": " and the", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -39061,7 +63486,7 @@ "data": { "event": { "delta": { - "text": "[k", + "text": " National", "type": "text" }, "event_type": { @@ -39081,7 +63506,7 @@ "data": { "event": { "delta": { - "text": "nowledge_search(query=\"Perplexity the company", + "text": " Basketball League", "type": "text" }, "event_type": { @@ -39101,7 +63526,7 @@ "data": { "event": { "delta": { - "text": " founding date\")]", + "text": " (N", "type": "text" }, "event_type": { @@ -39121,19 +63546,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "Perplexity the company founding date" - }, - "call_id": "199ef050-bc11-4e4b-935d-f5241c3f40ef", - "tool_name": "knowledge_search" - }, - "type": "tool_call" + "text": "BL).", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -39141,11 +63555,7 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } @@ -39173,55 +63583,19 @@ }, "metrics": [ { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "prompt_tokens", - "span_id": "m4wMGuSN", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:49.880525+00:00", - "__module__": "datetime" - }, - "trace_id": "XhZWljYTTDCYF7vI", - "type": "metric", - "unit": "tokens", - "value": 29 + "unit": null, + "value": 63 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "completion_tokens", - "span_id": "m4wMGuSN", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:49.880576+00:00", - "__module__": "datetime" - }, - "trace_id": "XhZWljYTTDCYF7vI", - "type": "metric", - "unit": "tokens", - "value": 23 + "unit": null, + "value": 45 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "total_tokens", - "span_id": "m4wMGuSN", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:49.880585+00:00", - "__module__": "datetime" - }, - "trace_id": "XhZWljYTTDCYF7vI", - "type": "metric", - "unit": "tokens", - "value": 52 + "unit": null, + "value": 108 } ] } @@ -39229,7 +63603,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"when was the nba created?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"NBA creation date\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"when was the nba created?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -39257,7 +63631,7 @@ "data": { "event": { "delta": { - "text": "The", + "text": "[k", "type": "text" }, "event_type": { @@ -39277,7 +63651,7 @@ "data": { "event": { "delta": { - "text": " NBA was created on August 3, 1949, with", + "text": "nowledge_search", "type": "text" }, "event_type": { @@ -39297,7 +63671,7 @@ "data": { "event": { "delta": { - "text": " the merger of the Basketball Association of America (BAA) and the National", + "text": "(query=\"", "type": "text" }, "event_type": { @@ -39317,7 +63691,7 @@ "data": { "event": { "delta": { - "text": " Basketball League (NBL).", + "text": "NBA", "type": "text" }, "event_type": { @@ -39337,108 +63711,7 @@ "data": { "event": { "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "OyfVMRgR", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:53.322420+00:00", - "__module__": "datetime" - }, - "trace_id": "TMrhR55CR-KrmGp0", - "type": "metric", - "unit": "tokens", - "value": 63 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "OyfVMRgR", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:53.322482+00:00", - "__module__": "datetime" - }, - "trace_id": "TMrhR55CR-KrmGp0", - "type": "metric", - "unit": "tokens", - "value": 45 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "OyfVMRgR", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:53.322490+00:00", - "__module__": "datetime" - }, - "trace_id": "TMrhR55CR-KrmGp0", - "type": "metric", - "unit": "tokens", - "value": 108 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"when was the nba created?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "[k", + "text": " creation date", "type": "text" }, "event_type": { @@ -39458,7 +63731,7 @@ "data": { "event": { "delta": { - "text": "nowledge_search(query=\"NBA creation date\")]", + "text": "\")]", "type": "text" }, "event_type": { @@ -39487,7 +63760,7 @@ "arguments": { "query": "NBA creation date" }, - "call_id": "388e55ab-448a-4a98-905b-196c051bdeea", + "call_id": "bac8b49d-537e-4c73-bb9e-c06475903366", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -39530,54 +63803,18 @@ }, "metrics": [ { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "prompt_tokens", - "span_id": "QpFMmy3B", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:52.235138+00:00", - "__module__": "datetime" - }, - "trace_id": "TMrhR55CR-KrmGp0", - "type": "metric", - "unit": "tokens", + "unit": null, "value": 27 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "completion_tokens", - "span_id": "QpFMmy3B", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:52.235160+00:00", - "__module__": "datetime" - }, - "trace_id": "TMrhR55CR-KrmGp0", - "type": "metric", - "unit": "tokens", + "unit": null, "value": 20 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "total_tokens", - "span_id": "QpFMmy3B", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:52.235165+00:00", - "__module__": "datetime" - }, - "trace_id": "TMrhR55CR-KrmGp0", - "type": "metric", - "unit": "tokens", + "unit": null, "value": 47 } ] diff --git a/tests/integration/fixtures/recorded_responses/invoke_tool.json b/tests/integration/fixtures/recorded_responses/invoke_tool.json index 76191e992f..f52e34333d 100644 --- a/tests/integration/fixtures/recorded_responses/invoke_tool.json +++ b/tests/integration/fixtures/recorded_responses/invoke_tool.json @@ -25,6 +25,19 @@ } } }, + "[[], {\"kwargs\": {\"code\": \"def is_prime(n):\\n if n <= 1:\\n return False\\n if n == 2:\\n return True\\n if n % 2 == 0:\\n return False\\n max_divisor = int(n**0.5) + 1\\n for d in range(3, max_divisor, 2):\\n if n % d == 0:\\n return False\\n return True\\n\\ndef nth_prime(n):\\n count = 0\\n num = 2\\n while True:\\n if is_prime(num):\\n count += 1\\n if count == n:\\n return num\\n num += 1\\n\\nprint(nth_prime(100))\", \"session_id\": \"\"}, \"tool_name\": \"code_interpreter\"}]": { + "type": "value", + "value": { + "__module__": "llama_stack.apis.tools.tools", + "__pydantic__": "ToolInvocationResult", + "data": { + "content": "error\n[stdout]\n[Errno 2] No such file or directory: 'bwrap'\n[/stdout]\n[stderr]\n[Errno 2] No such file or directory: 'bwrap'\n[/stderr]", + "error_code": null, + "error_message": null, + "metadata": null + } + } + }, "[[], {\"kwargs\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\", \"session_id\": \"\"}, \"tool_name\": \"code_interpreter\"}]": { "type": "value", "value": { @@ -64,6 +77,19 @@ } } }, + "[[], {\"kwargs\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print the shape of the dataframe (number of rows and columns)\\nprint(df.shape)\\n\\n# Print the column names\\nprint(df.columns)\\n\\n# Print the data types of each column\\nprint(df.dtypes)\\n\\n# Print a summary of the dataframe (count, mean, std, min, 25%, 50%, 75%, max)\\nprint(df.describe())\", \"session_id\": \"\"}, \"tool_name\": \"code_interpreter\"}]": { + "type": "value", + "value": { + "__module__": "llama_stack.apis.tools.tools", + "__pydantic__": "ToolInvocationResult", + "data": { + "content": "error\n[stdout]\n[Errno 2] No such file or directory: 'bwrap'\n[/stdout]\n[stderr]\n[Errno 2] No such file or directory: 'bwrap'\n[/stderr]", + "error_code": null, + "error_message": null, + "metadata": null + } + } + }, "[[], {\"kwargs\": {\"code\": \"import pandas as pd\\ndf = pd.read_csv(\\\"\")\\nprint(df.head())\", \"session_id\": \"\"}, \"tool_name\": \"code_interpreter\"}]": { "type": "value", "value": { @@ -129,6 +155,19 @@ } } }, + "[[], {\"kwargs\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Convert the 'Year' column to datetime\\ndf['Year'] = pd.to_datetime(df['Year'], format='%Y')\\n\\n# Group by 'Year' and calculate the average inflation\\ndf_avg_inflation = df.groupby('Year')['Inflation'].mean().reset_index()\\n\\n# Plot the average inflation as a time series\\nplt.figure(figsize=(10,6))\\nplt.plot(df_avg_inflation['Year'], df_avg_inflation['Inflation'], marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Inflation')\\nplt.grid(True)\\nplt.show()\", \"session_id\": \"\"}, \"tool_name\": \"code_interpreter\"}]": { + "type": "value", + "value": { + "__module__": "llama_stack.apis.tools.tools", + "__pydantic__": "ToolInvocationResult", + "data": { + "content": "error\n[stdout]\n[Errno 2] No such file or directory: 'bwrap'\n[/stdout]\n[stderr]\n[Errno 2] No such file or directory: 'bwrap'\n[/stderr]", + "error_code": null, + "error_message": null, + "metadata": null + } + } + }, "[[], {\"kwargs\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Convert the 'Year' column to datetime\\ndf['Year'] = pd.to_datetime(df['Year'], format='%Y')\\n\\n# Group by 'Year' and calculate the average inflation\\ndf_avg_inflation = df.groupby('Year')['Inflation'].mean().reset_index()\\n\\n# Plot the average yearly inflation as a time series\\nplt.figure(figsize=(10,6))\\nplt.plot(df_avg_inflation['Year'], df_avg_inflation['Inflation'], marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Inflation')\\nplt.grid(True)\\nplt.show()\", \"session_id\": \"\"}, \"tool_name\": \"code_interpreter\"}]": { "type": "value", "value": { @@ -374,23 +413,23 @@ "type": "text" }, { - "text": "Result 1:\nDocument_id:24443\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\nlook like so:\n\n.. code-block:: python\n\n from torchtune.datasets import chat_dataset\n from torchtune.models.llama3 import llama3_tokenizer\n\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\n ds = chat_dataset(\n tokenizer=tokenizer,\n source=\"json\",\n data_files=\"data/my_data.json\",\n split=\"train\",\n conversation_column=\"dialogue\",\n conversation_style=\"sharegpt\",\n )\n\n.. code-block:: yaml\n\n # In config\n tokenizer:\n _component_: torchtune.models.llama3.llama3_tokenizer\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\n\n dataset:\n _component_: torchtune.datasets.chat_dataset\n source: json\n data_files: data/my_data.json\n split: train\n conversation_column: dialogue\n conversation_style: sharegpt\n\n.. note::\n You can pass in any keyword argument for `load_dataset `_ into all our\n Dataset classes and they will honor them. This is useful for common parameters\n such as specifying the data split with :code:`split` or configuration with\n :code:`name`\n\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\nall messages according to their `recommendations `_ into all our\n Dataset classes and they will honor them. This is useful for common parameters\n such as specifying the data split with :code:`split` or configuration with\n :code:`name`\n\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW `.\n.. .. _glossary_fsdp2:\n\n", + "text": "Result 3:\nDocument_id:122a9\nContent: ` module, which we swap\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP `.\n.. .. _glossary_fsdp2:\n\n", "type": "text" }, { - "text": "Result 4:\nDocument_id:961ff\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", + "text": "Result 4:\nDocument_id:700ad\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", "type": "text" }, { - "text": "Result 5:\nDocument_id:b49f7\nContent: etune\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.use_dora=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n use_dora: True\n\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\neven more memory savings!\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.apply_lora_to_mlp=True \\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\n model.lora_rank=16 \\\n model.lora_alpha=32 \\\n model.use_dora=True \\\n model.quantize_base=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n apply_lora_to_mlp: True\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\n lora_rank: 16\n lora_alpha: 32\n use_dora: True\n quantize_base: True\n\n\n.. note::\n\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP `.\n.. .. _glossary_fsdp2:\n\n", + "text": "Result 5:\nDocument_id:122a9\nContent: etune\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.use_dora=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n use_dora: True\n\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\neven more memory savings!\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.apply_lora_to_mlp=True \\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\n model.lora_rank=16 \\\n model.lora_alpha=32 \\\n model.use_dora=True \\\n model.quantize_base=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n apply_lora_to_mlp: True\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\n lora_rank: 16\n lora_alpha: 32\n use_dora: True\n quantize_base: True\n\n\n.. note::\n\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP `.\n.. .. _glossary_fsdp2:\n\n", "type": "text" }, { @@ -402,11 +441,11 @@ "error_message": null, "metadata": { "document_ids": [ - "24443dfb-a0b3-4ce8-820e-3fb1f12364bb", - "961ff2d1-8887-41ef-a4fe-fa4cbab7b932", - "b49f7985-6615-4dcf-99be-d1765b6a6fc6", - "961ff2d1-8887-41ef-a4fe-fa4cbab7b932", - "b49f7985-6615-4dcf-99be-d1765b6a6fc6" + "895539dd-a627-4c02-94d7-6591cd0ce00f", + "700ad5a6-e318-48ad-99b2-93934c5d7f8c", + "122a966a-6d33-4482-87a6-f5d16e9f92be", + "700ad5a6-e318-48ad-99b2-93934c5d7f8c", + "122a966a-6d33-4482-87a6-f5d16e9f92be" ] } } @@ -418,7 +457,7 @@ "__module__": "llama_stack.apis.tools.tools", "__pydantic__": "ToolInvocationResult", "data": { - "content": "{\"query\": \"current CEO of Meta\", \"top_k\": [{\"title\": \"Meta - Leadership & Governance\", \"url\": \"https://investor.atmeta.com/leadership-and-governance/\", \"content\": \"Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. Mark is responsible for setting the overall direction and product strategy for the company. He leads the design of Meta's services and development of its core technology and infrastructure. Mark studied computer science at Harvard\", \"score\": 0.8342047, \"raw_content\": null}, {\"title\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\", \"url\": \"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\", \"content\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\u00a9 2025 Meta\", \"score\": 0.79099923, \"raw_content\": null}, {\"title\": \"The 11 People Running Meta's $1 Trillion Social Media and ... - Observer\", \"url\": \"https://observer.com/2024/01/meta-facebook-top-executives/\", \"content\": \"Meta has one of the most stable leadership team in the tech industry. Almost all of Meta's top executives have been with the company for well over a decade. ... 39, cofounder, chairman and CEO\", \"score\": 0.45536873, \"raw_content\": null}, {\"title\": \"Executives - Meta\", \"url\": \"https://about.meta.com/media-gallery/executives/\", \"content\": \"Meta leadership: images of senior executives for download to use in articles about the company.\", \"score\": 0.21026355, \"raw_content\": null}, {\"title\": \"Mark Zuckerberg - Wikipedia\", \"url\": \"https://en.wikipedia.org/wiki/Mark_Zuckerberg\", \"content\": \"They began dating in 2003.[175] In September 2010, Chan, who was a medical student at the University of California, San Francisco at the time,[176] moved into his rented house in Palo Alto, California.[177][178] They married on May 19, 2012, in the grounds of his mansion in an event that also celebrated her graduation from medical school.[179][180] Zuckerberg revealed in July 2015 that they were expecting a baby girl and that Chan had previously experienced three miscarriages.[181] Their first daughter was born in December 2015.[182] They announced in a Chinese New Year video that their daughter's Chinese name is Chen Mingyu (Chinese: \\u9648\\u660e\\u5b87).[183] Their second daughter was born in August 2017.[184] Zuckerberg and his wife welcomed their third daughter in March 2023 and announced the news across his social media pages.[185] The couple also have a Puli dog named Beast,[186] who has over two million followers on Facebook.[187] Zuckerberg commissioned the visual artist Daniel Arsham to build a 7-foot-tall sculpture of his wife, which was unveiled in 2024.[188]\", \"score\": 0.05564338, \"raw_content\": null}]}", + "content": "{\"query\": \"current CEO of Meta\", \"top_k\": [{\"title\": \"Executives - Meta\", \"url\": \"https://about.meta.com/media-gallery/executives/\", \"content\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer Joel Kaplan, Chief Global Affairs Officer Susan Li, Chief Financial Officer Javier Olivan, Chief Operating Officer Chris Cox, Chief Product Officer Andrew \\u2018Boz\\u2019 Bosworth, Chief Technology Officer Jennifer Newstead, Chief Legal Officer Dave Wehner, Chief Strategy Officer Will Cathcart, Head of WhatsApp Naomi Gleit, Head of Product John Hegeman, Chief Revenue Officer Adam Mosseri, Head of Instagram Erin Egan, Chief Privacy Officer, Policy Michel Protti, Chief Privacy Officer, Product Alex Schultz, Chief Marketing Officer and VP of Analytics Tom Alison, Head of Facebook Nicola Mendelsohn, Head of Global Business Group Ahmad Al-Dahle, VP and Head of GenAI at Meta Joelle Pineau, Vice President of AI Research and Head of FAIR at Meta\", \"score\": 0.8190992, \"raw_content\": null}, {\"title\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\", \"url\": \"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\", \"content\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\u00a9 2025 Meta\", \"score\": 0.79080546, \"raw_content\": null}, {\"title\": \"Meet the Executive CSuite Team of Meta (Facebook) [2025]\", \"url\": \"https://digitaldefynd.com/IQ/meet-the-executive-csuite-team-of-meta-facebook/\", \"content\": \"Harvard University Executive Programs Free Harvard University Courses As a chief financial officer of Meta, Susan Li oversees the firm\\u2019s finance and facilities team to keep track of the company\\u2019s overall financial health. The chief operating officer of Meta, Javier Olivan, oversees the firm\\u2019s business team, infrastructure, and other products. Andrew Bosworth, called Boz, serves as chief technology officer at Meta and is responsible for leading the firm\\u2019s AR/VR organization, Reality Labs. Andrew has also served as engineering director to oversee events, mobile monetization, and feed ads and as VP of ads and business platforms to lead engineering, design, analytics, and product teams. Meta\\u2019s c-suite team comprises experienced and diverse executives, having extensive experience in technology, finance, legal, and all major industries.\", \"score\": 0.7602419, \"raw_content\": null}, {\"title\": \"Meta - Leadership & Governance\", \"url\": \"https://investor.atmeta.com/leadership-and-governance/\", \"content\": \"Mr. Andreessen was a co-founder of Netscape Communications Corporation, a software company, serving in various positions, including Chief Technology Officer and Executive Vice President of Products. Ms. Killefer also served as Assistant Secretary for Management, Chief Financial Officer, and Chief Operating Officer of the U.S. Department of the Treasury from 1997 to 2000 and as a member of the IRS Oversight Board from 2000 to 2005, including as Chair of the IRS Oversight Board from 2002 to 2004. Ms. Travis has served as Executive Vice President and Chief Financial Officer of The Estee Lauder Companies Inc., a global manufacturer and marketer of skin care, makeup, fragrance and hair care products, since August 2012.\", \"score\": 0.6175132, \"raw_content\": null}, {\"title\": \"Mark Zuckerberg - Wikipedia\", \"url\": \"https://en.wikipedia.org/wiki/Mark_Zuckerberg\", \"content\": \"They began dating in 2003.[175] In September 2010, Chan, who was a medical student at the University of California, San Francisco at the time,[176] moved into his rented house in Palo Alto, California.[177][178] They married on May 19, 2012, in the grounds of his mansion in an event that also celebrated her graduation from medical school.[179][180] Zuckerberg revealed in July 2015 that they were expecting a baby girl and that Chan had previously experienced three miscarriages.[181] Their first daughter was born in December 2015.[182] They announced in a Chinese New Year video that their daughter's Chinese name is Chen Mingyu (Chinese: \\u9648\\u660e\\u5b87).[183] Their second daughter was born in August 2017.[184] Zuckerberg and his wife welcomed their third daughter in March 2023 and announced the news across his social media pages.[185] The couple also have a Puli dog named Beast,[186] who has over two million followers on Facebook.[187] Zuckerberg commissioned the visual artist Daniel Arsham to build a 7-foot-tall sculpture of his wife, which was unveiled in 2024.[188]\", \"score\": 0.05570498, \"raw_content\": null}]}", "error_code": null, "error_message": null, "metadata": null @@ -437,23 +476,23 @@ "type": "text" }, { - "text": "Result 1:\nDocument_id:20e5d\nContent: .. _lora_finetune_label:\n\n============================\nFine-Tuning Llama2 with LoRA\n============================\n\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\n See :ref:`below` for how to do this.\n\nLet's inspect each of these models a bit more closely.\n\n.. code-block:: bash\n\n # Print the first layer's self-attention in the usual Llama2 model\n >>> print(base_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (pos_embeddings): RotaryPositionalEmbeddings()\n )\n\n # Print the same for Llama2 with LoRA weights\n >>> print(lora_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): LoRALinear(\n (dropout): Dropout(p=0.0, inplace=False)\n \n", + "text": "Result 2:\nDocument_id:700ad\nContent: LoRA to Llama2 models\n------------------------------\n\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\n\n.. code-block:: python\n\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\n\n # Build Llama2 without any LoRA layers\n base_model = llama2_7b()\n\n # The default settings for lora_llama2_7b will match those for llama2_7b\n # We just need to define which layers we want LoRA applied to.\n # Within each self-attention, we can choose from [\"q_proj\", \"k_proj\", \"v_proj\", and \"output_proj\"].\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\n # layers outside of the self-attention.\n lora_model = lora_llama2_7b(lora_attn_modules=[\"q_proj\", \"v_proj\"])\n\n.. note::\n\n Calling :func:`lora_llama_2_7b ` alone will not handle the definition of which parameters are trainable.\n See :ref:`below` for how to do this.\n\nLet's inspect each of these models a bit more closely.\n\n.. code-block:: bash\n\n # Print the first layer's self-attention in the usual Llama2 model\n >>> print(base_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (pos_embeddings): RotaryPositionalEmbeddings()\n )\n\n # Print the same for Llama2 with LoRA weights\n >>> print(lora_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): LoRALinear(\n (dropout): Dropout(p=0.0, inplace=False)\n \n", "type": "text" }, { - "text": "Result 3:\nDocument_id:20e5d\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", + "text": "Result 3:\nDocument_id:700ad\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", "type": "text" }, { - "text": "Result 4:\nDocument_id:20e5d\nContent: from our Llama2\nmodel without any wrappers or custom checkpoint conversion logic.\n\n.. code-block:: python\n\n # Assuming that base_model already has the pretrained Llama2 weights,\n # this will directly load them into your LoRA model without any conversion necessary.\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\n\n.. note::\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\n :func:`validate_missing_and_unexpected_for_lora() `.\n\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\n\n.. _setting_trainable_params:\n\n.. code-block:: python\n\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\n\n # Fetch all params from the model that are associated with LoRA.\n lora_params = get_adapter_params(lora_model)\n\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\n set_trainable_params(lora_model, lora_params)\n\n # Print the total number of parameters\n total_params = sum([p.numel() for p in lora_model.parameters()])\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\n print(\n f\"\"\"\n {total_params} total params,\n {trainable_params}\" trainable params,\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\n \"\"\"\n )\n\n 6742609920 total params,\n 4194304 trainable params,\n 0.06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `.\n\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\n\n.. _setting_trainable_params:\n\n.. code-block:: python\n\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\n\n # Fetch all params from the model that are associated with LoRA.\n lora_params = get_adapter_params(lora_model)\n\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\n set_trainable_params(lora_model, lora_params)\n\n # Print the total number of parameters\n total_params = sum([p.numel() for p in lora_model.parameters()])\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\n print(\n f\"\"\"\n {total_params} total params,\n {trainable_params}\" trainable params,\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\n \"\"\"\n )\n\n 6742609920 total params,\n 4194304 trainable params,\n 0.06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe Date: Wed, 12 Mar 2025 18:14:33 -0700 Subject: [PATCH 08/14] comments --- tests/integration/datasetio/test_datasetio.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/integration/datasetio/test_datasetio.py b/tests/integration/datasetio/test_datasetio.py index 64e187fb5d..1885c9bf81 100644 --- a/tests/integration/datasetio/test_datasetio.py +++ b/tests/integration/datasetio/test_datasetio.py @@ -17,13 +17,13 @@ @pytest.fixture -def test_dataset(llama_stack_client): - register_dataset(llama_stack_client) - yield # This is where the test function will run - +def dataset_for_test(llama_stack_client): + dataset_id = "test_dataset" + register_dataset(llama_stack_client, dataset_id=dataset_id) + yield # Teardown - this always runs, even if the test fails try: - llama_stack_client.datasets.unregister("test_dataset") + llama_stack_client.datasets.unregister(dataset_id) except Exception as e: print(f"Warning: Failed to unregister test_dataset: {e}") @@ -94,7 +94,7 @@ def test_register_unregister_dataset(llama_stack_client): assert len(response) == 0 -def test_get_rows_paginated(llama_stack_client, test_dataset): +def test_get_rows_paginated(llama_stack_client, dataset_for_test): response = llama_stack_client.datasetio.get_rows_paginated( dataset_id="test_dataset", rows_in_page=3, From 59b12630904ead9f803f9beede26829c0d12f6e8 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Wed, 12 Mar 2025 18:15:15 -0700 Subject: [PATCH 09/14] comments --- tests/integration/datasetio/test_datasetio.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/datasetio/test_datasetio.py b/tests/integration/datasetio/test_datasetio.py index 1885c9bf81..459589e7b0 100644 --- a/tests/integration/datasetio/test_datasetio.py +++ b/tests/integration/datasetio/test_datasetio.py @@ -20,7 +20,7 @@ def dataset_for_test(llama_stack_client): dataset_id = "test_dataset" register_dataset(llama_stack_client, dataset_id=dataset_id) - yield + yield # Teardown - this always runs, even if the test fails try: llama_stack_client.datasets.unregister(dataset_id) From f1f0b7bb22deef742108fde5436fe7fc8d957532 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Wed, 12 Mar 2025 18:16:28 -0700 Subject: [PATCH 10/14] rename scoring --- tests/integration/scoring/test_scoring.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/integration/scoring/test_scoring.py b/tests/integration/scoring/test_scoring.py index a2d8500d58..3f0f832c12 100644 --- a/tests/integration/scoring/test_scoring.py +++ b/tests/integration/scoring/test_scoring.py @@ -11,7 +11,7 @@ @pytest.fixture -def test_dataset_rag(llama_stack_client): +def rag_dataset_for_test(llama_stack_client): register_dataset(llama_stack_client, for_rag=True) yield # This is where the test function will run @@ -91,7 +91,7 @@ def test_scoring_functions_register( # TODO: add unregister api for scoring functions -def test_scoring_score(llama_stack_client, test_dataset_rag): +def test_scoring_score(llama_stack_client, rag_dataset_for_test): # scoring individual rows rows = llama_stack_client.datasetio.get_rows_paginated( dataset_id="test_dataset", @@ -126,7 +126,7 @@ def test_scoring_score(llama_stack_client, test_dataset_rag): def test_scoring_score_with_params_llm_as_judge( - llama_stack_client, sample_judge_prompt_template, judge_model_id, test_dataset_rag + llama_stack_client, sample_judge_prompt_template, judge_model_id, rag_dataset_for_test ): # scoring individual rows rows = llama_stack_client.datasetio.get_rows_paginated( @@ -177,7 +177,7 @@ def test_scoring_score_with_params_llm_as_judge( ], ) def test_scoring_score_with_aggregation_functions( - llama_stack_client, sample_judge_prompt_template, judge_model_id, provider_id, test_dataset_rag + llama_stack_client, sample_judge_prompt_template, judge_model_id, provider_id, rag_dataset_for_test ): rows = llama_stack_client.datasetio.get_rows_paginated( dataset_id="test_dataset", From 329164ba09d7e6b88e25fe0f2a1211aa81809853 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Wed, 12 Mar 2025 22:28:19 -0700 Subject: [PATCH 11/14] add dataset_id --- tests/integration/scoring/test_scoring.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/integration/scoring/test_scoring.py b/tests/integration/scoring/test_scoring.py index 3f0f832c12..970a96f407 100644 --- a/tests/integration/scoring/test_scoring.py +++ b/tests/integration/scoring/test_scoring.py @@ -12,12 +12,13 @@ @pytest.fixture def rag_dataset_for_test(llama_stack_client): - register_dataset(llama_stack_client, for_rag=True) + dataset_id = "test_dataset" + register_dataset(llama_stack_client, for_rag=True, dataset_id=dataset_id) yield # This is where the test function will run # Teardown - this always runs, even if the test fails try: - llama_stack_client.datasets.unregister("test_dataset") + llama_stack_client.datasets.unregister(dataset_id) except Exception as e: print(f"Warning: Failed to unregister test_dataset: {e}") From 0ae5c08363b8355119425d2235d2e1be02e64d74 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Wed, 12 Mar 2025 22:29:25 -0700 Subject: [PATCH 12/14] revert mocks --- .../recorded_responses/chat_completion.json | 31337 +++++----------- .../recorded_responses/invoke_tool.json | 117 +- 2 files changed, 9980 insertions(+), 21474 deletions(-) diff --git a/tests/integration/fixtures/recorded_responses/chat_completion.json b/tests/integration/fixtures/recorded_responses/chat_completion.json index 71fbc9361b..8694cc2713 100644 --- a/tests/integration/fixtures/recorded_responses/chat_completion.json +++ b/tests/integration/fixtures/recorded_responses/chat_completion.json @@ -12535,7 +12535,23 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 139 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 23 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 162 + } + ] } } ], @@ -12589,7 +12605,7 @@ "data": { "event": { "delta": { - "text": "type\": \"function\", \"name\":", + "text": "type\": \"function\", \"name\": \"", "type": "text" }, "event_type": { @@ -12609,7 +12625,7 @@ "data": { "event": { "delta": { - "text": " \"get_boiling_point\", \"parameters\":", + "text": "get_boiling_point\", \"parameters\": {\"", "type": "text" }, "event_type": { @@ -12629,7 +12645,7 @@ "data": { "event": { "delta": { - "text": " {\"liquid_name\": \"polyjuice", + "text": "liquid_name\": \"polyjuice\", \"celcius\":", "type": "text" }, "event_type": { @@ -12649,7 +12665,7 @@ "data": { "event": { "delta": { - "text": "\", \"celcius\": \"false\"}}", + "text": " \"false\"}}", "type": "text" }, "event_type": { @@ -12679,7 +12695,7 @@ "celcius": "false", "liquid_name": "polyjuice" }, - "call_id": "bffe07d7-343f-49c4-bcff-d83c99fa7d4a", + "call_id": "fc7e2525-3e7b-47ff-8731-12dd7655dfd6", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -12720,13 +12736,29 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 91 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 45 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 136 + } + ] } } ], "type": "generator" }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. \", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. \", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"true\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"false\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -12754,7 +12786,7 @@ "data": { "event": { "delta": { - "text": "{\n", + "text": "The", "type": "text" }, "event_type": { @@ -12774,7 +12806,7 @@ "data": { "event": { "delta": { - "text": " \"type\": \"function\",\n \"name\": \"get", + "text": " boiling point of polyjuice is -100 degrees Fahrenheit.", "type": "text" }, "event_type": { @@ -12794,33 +12826,58 @@ "data": { "event": { "delta": { - "text": "_boiling_point\",\n \"parameters\": {\n \"liquid_name", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 139 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 23 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 162 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. \", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"true\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "\": \"polyjuice\",\n ", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -12834,7 +12891,7 @@ "data": { "event": { "delta": { - "text": " \"celcius\": \"true\"\n }\n}", + "text": "{\"", "type": "text" }, "event_type": { @@ -12854,20 +12911,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "celcius": "true", - "liquid_name": "polyjuice" - }, - "call_id": "41ce6bfb-81c1-438d-8520-329c4446f1bc", - "tool_name": "get_boiling_point" - }, - "type": "tool_call" + "text": "type\": \"function\", \"name\": \"get_boiling_point\", \"", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -12875,11 +12920,7 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } @@ -12890,42 +12931,13 @@ "data": { "event": { "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": null - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"true\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", + "text": "parameters\": {\"liquid_name\": \"polyjuice\", \"", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -12939,7 +12951,7 @@ "data": { "event": { "delta": { - "text": "The", + "text": "celcius\": \"false\"}}", "type": "text" }, "event_type": { @@ -12959,8 +12971,20 @@ "data": { "event": { "delta": { - "text": " boiling point of polyjuice is -100\u00b0C.", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "celcius": "false", + "liquid_name": "polyjuice" + }, + "call_id": "1ef7adda-5ebb-41d5-a2c6-3e6700de5f81", + "tool_name": "get_boiling_point" + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -12968,7 +12992,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -12994,13 +13022,29 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 91 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 45 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 136 + } + ] } } ], "type": "generator" }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"true\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point_with_metadata\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point_with_metadata\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}]": { + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. \", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -13028,7 +13072,7 @@ "data": { "event": { "delta": { - "text": "The", + "text": "{\n", "type": "text" }, "event_type": { @@ -13048,7 +13092,7 @@ "data": { "event": { "delta": { - "text": " boiling point of polyjuice is -100\u00b0C.", + "text": " \"type\": \"function\",\n \"name\": \"get", "type": "text" }, "event_type": { @@ -13068,88 +13112,9 @@ "data": { "event": { "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": null - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", + "text": "_boiling_point\",\n \"parameters\": {\n \"liquid", "type": "text" }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "{\"type\": \"function\", \"name\": \"get_boiling_point", - "type": "tool_call" - }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", @@ -13167,13 +13132,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "\", \"parameters\": {\"liquid_name\": \"polyjuice\", \"cel", - "type": "tool_call" + "text": "_name\": \"polyjuice\",\n \"celci", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -13192,13 +13152,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "cius\": \"true\"}}", - "type": "tool_call" + "text": "us\": \"true\"\n }\n}", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -13227,7 +13182,7 @@ "celcius": "true", "liquid_name": "polyjuice" }, - "call_id": "6161b956-9b68-4e88-87bf-e26a07d4c7ca", + "call_id": "40293d5b-8a76-4df5-8325-d6e8755ba513", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -13268,13 +13223,29 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 43 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 55 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 98 + } + ] } } ], "type": "generator" }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}]": { + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"true\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -13302,38 +13273,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "{\"type\": \"function\", \"name\": \"get_boiling", - "type": "tool_call" + "text": "The", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -13352,13 +13293,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "_point_with_metadata\", \"parameters\": {\"liquid_name\": \"polyju", - "type": "tool_call" + "text": " boiling point of polyjuice is", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -13377,13 +13313,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "ice\", \"celcius\": \"true\"}}", - "type": "tool_call" + "text": " -100\u00b0C.", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -13396,42 +13327,6 @@ "metrics": null } }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "celcius": "true", - "liquid_name": "polyjuice" - }, - "call_id": "11da4a37-d7ad-468a-98c8-0f1e295d14a9", - "tool_name": "get_boiling_point_with_metadata" - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": null - } - }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", @@ -13453,13 +13348,29 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 85 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 22 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 107 + } + ] } } ], "type": "generator" }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Give me a sentence that contains the word: hello\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": []}]": { + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"true\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -13507,7 +13418,7 @@ "data": { "event": { "delta": { - "text": " customer smiled and said \"hello\" to the friendly store clerk.", + "text": " boiling point of polyjuice is -100\u00b0C.", "type": "text" }, "event_type": { @@ -13542,13 +13453,29 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 85 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 22 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 107 + } + ] } } ], "type": "generator" }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\ndf = pd.read_csv(\\\"\")\\nprint(df.head())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"true\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point_with_metadata\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point_with_metadata\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -13596,7 +13523,7 @@ "data": { "event": { "delta": { - "text": " error message indicates that the `bwrap.core` module is", + "text": " boiling point of polyjuice is -100", "type": "text" }, "event_type": { @@ -13616,7 +13543,7 @@ "data": { "event": { "delta": { - "text": " not found. This is because the", + "text": "\u00b0C.", "type": "text" }, "event_type": { @@ -13636,53 +13563,58 @@ "data": { "event": { "delta": { - "text": " `bwrap.core` module is not a standard Python module", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " and is not installed by default.\n\nTo", - "type": "text" + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 87 }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + { + "metric": "completion_tokens", + "unit": null, + "value": 22 }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null + { + "metric": "total_tokens", + "unit": null, + "value": 109 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"true\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point_with_metadata\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " fix this issue, you can use", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -13696,7 +13628,7 @@ "data": { "event": { "delta": { - "text": " the `pathlib` module to access the file directly. Here", + "text": "The", "type": "text" }, "event_type": { @@ -13716,7 +13648,7 @@ "data": { "event": { "delta": { - "text": "'s an updated code snippet:\n\n```python\nimport pandas", + "text": " boiling point of polyjuice is -", "type": "text" }, "event_type": { @@ -13736,7 +13668,7 @@ "data": { "event": { "delta": { - "text": " as pd\nfrom pathlib import Path\n\nfile_path", + "text": "100 degrees Celcius.", "type": "text" }, "event_type": { @@ -13756,53 +13688,58 @@ "data": { "event": { "delta": { - "text": " = Path(\"/var/folders/cz/vyh7y", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "1d11xg881lsxsshnc5c0000gn", - "type": "text" + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 87 }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + { + "metric": "completion_tokens", + "unit": null, + "value": 25 }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null + { + "metric": "total_tokens", + "unit": null, + "value": 112 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "/T/tmpeipex0j0", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -13816,8 +13753,13 @@ "data": { "event": { "delta": { - "text": "/b807hgTQinflation.csv\")\ndf = pd.read_csv", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -13836,8 +13778,13 @@ "data": { "event": { "delta": { - "text": "(file_path)\nprint(df.head())\n```\n\n", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "{\"type\": \"function\", \"", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -13856,8 +13803,13 @@ "data": { "event": { "delta": { - "text": "This code uses the `Path` class from the `pathlib", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "name\": \"get_boiling_point\", \"parameters", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -13876,8 +13828,13 @@ "data": { "event": { "delta": { - "text": "` module to create a path object for the file. The `", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\": {\"liquid_name\": \"polyjuice\", \"celci", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -13896,8 +13853,13 @@ "data": { "event": { "delta": { - "text": "read_csv` method is then used to read the CSV file into", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "us\": \"true\"}}", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -13916,8 +13878,20 @@ "data": { "event": { "delta": { - "text": " a pandas DataFrame.", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "celcius": "true", + "liquid_name": "polyjuice" + }, + "call_id": "f146d04b-c400-4193-a6d8-ccfea7f7b529", + "tool_name": "get_boiling_point" + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -13925,7 +13899,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -13951,13 +13929,29 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 37 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 47 + } + ] } } ], "type": "generator" }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport code_interpreter\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print the data types of each column\\nprint(df.dtypes)\\n\\n# Print the summary statistics of the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport code_interpreter\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print the data types of each column\\nprint(df.dtypes)\\n\\n# Print the summary statistics of the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -13985,8 +13979,13 @@ "data": { "event": { "delta": { - "text": "I", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -14005,8 +14004,13 @@ "data": { "event": { "delta": { - "text": "'m unable to access the file you provided", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "{\"type\": \"function\", \"name\": \"", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -14025,8 +14029,13 @@ "data": { "event": { "delta": { - "text": ". However, I", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "get_boiling_point_with_metadata\", \"parameters\": {\"liquid", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -14045,8 +14054,13 @@ "data": { "event": { "delta": { - "text": " can suggest how you can describe the CSV file using the pandas library in Python.\n\nYou can use the `head()`, `dtypes`, and `describe()` functions to get an overview of the CSV file", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "_name\": \"polyjuice\", \"celci", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -14065,8 +14079,13 @@ "data": { "event": { "delta": { - "text": ".\n\n- `head()`: This function prints the first few rows of the", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "us\": \"", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -14085,8 +14104,13 @@ "data": { "event": { "delta": { - "text": " dataframe, giving you an idea of what the", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "true\"}}", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -14105,8 +14129,20 @@ "data": { "event": { "delta": { - "text": " data looks like.\n- `dtypes`: This", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "celcius": "true", + "liquid_name": "polyjuice" + }, + "call_id": "d6b8a25d-9b4c-4650-bbe6-f94b5fa97e56", + "tool_name": "get_boiling_point_with_metadata" + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -14114,7 +14150,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -14125,33 +14165,58 @@ "data": { "event": { "delta": { - "text": " function prints the data types of each column in the", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 37 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 47 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Give me a sentence that contains the word: hello\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": []}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " dataframe.\n- `describe()`: This function prints summary", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -14165,7 +14230,7 @@ "data": { "event": { "delta": { - "text": " statistics of the dataframe, including mean, standard deviation, minimum, maximum,", + "text": "The", "type": "text" }, "event_type": { @@ -14185,7 +14250,7 @@ "data": { "event": { "delta": { - "text": " and quartiles for numeric columns, and count and unique values for", + "text": " customer smiled and said \"hello\" to the friendly store", "type": "text" }, "event_type": { @@ -14205,7 +14270,7 @@ "data": { "event": { "delta": { - "text": " object columns.\n\nIf you want to get more information about the CSV file,", + "text": " clerk.", "type": "text" }, "event_type": { @@ -14225,33 +14290,58 @@ "data": { "event": { "delta": { - "text": " you can use the `info()` function, which prints a concise summary", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 30 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 24 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 54 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\ndf = pd.read_csv(\\\"\")\\nprint(df.head())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " of the dataframe, including the index dtype and column dtypes, non-", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -14265,7 +14355,7 @@ "data": { "event": { "delta": { - "text": "nullable values, and memory usage.\n\nPlease make sure the file is in the", + "text": "The", "type": "text" }, "event_type": { @@ -14285,7 +14375,7 @@ "data": { "event": { "delta": { - "text": " correct format and is accessible to the Python script.", + "text": " error message indicates that the `bwrap.core` module is", "type": "text" }, "event_type": { @@ -14305,42 +14395,13 @@ "data": { "event": { "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": null - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport code_interpreter\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print the data types of each column\\nprint(df.dtypes)\\n\\n# Print the summary statistics of the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport code_interpreter\\n\\n# Load the CSV file\\ndf = pd.read_csv(code_interpreter.get_file_path(\\\"\"))\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print the data types of each column\\nprint(df.dtypes)\\n\\n# Print the summary statistics of the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", + "text": " not found. This is because the", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -14354,7 +14415,7 @@ "data": { "event": { "delta": { - "text": "The", + "text": " `bwrap.core` module is not a standard Python module", "type": "text" }, "event_type": { @@ -14374,7 +14435,7 @@ "data": { "event": { "delta": { - "text": " error message indicates that the `bwrap.core` module is not found", + "text": " and is not installed by default.\n\nTo", "type": "text" }, "event_type": { @@ -14394,7 +14455,7 @@ "data": { "event": { "delta": { - "text": ". This is likely because the `bwrap` library is not installed", + "text": " fix this issue, you can use", "type": "text" }, "event_type": { @@ -14414,7 +14475,7 @@ "data": { "event": { "delta": { - "text": ". To fix this, you can install the", + "text": " the `pathlib` module to access the file directly. Here", "type": "text" }, "event_type": { @@ -14434,7 +14495,7 @@ "data": { "event": { "delta": { - "text": " `bwrap` library using pip:\n\n```\npip install", + "text": "'s an updated code snippet:\n\n```python\nimport pandas", "type": "text" }, "event_type": { @@ -14454,7 +14515,7 @@ "data": { "event": { "delta": { - "text": " bwrap\n```\n\nIf you are still facing issues", + "text": " as pd\nfrom pathlib import Path\n\nfile_path", "type": "text" }, "event_type": { @@ -14474,7 +14535,7 @@ "data": { "event": { "delta": { - "text": ", you can try to use the `code_interpreter.get_file_path", + "text": " = Path(\"/var/folders/cz/vyh7y", "type": "text" }, "event_type": { @@ -14494,7 +14555,7 @@ "data": { "event": { "delta": { - "text": "()` function to load the CSV file directly, as shown in the corrected", + "text": "1d11xg881lsxsshnc5c0000gn", "type": "text" }, "event_type": { @@ -14514,7 +14575,7 @@ "data": { "event": { "delta": { - "text": " code above.\n\nAlternatively, if you don't have access to the `code", + "text": "/T/tmpeipex0j0", "type": "text" }, "event_type": { @@ -14534,7 +14595,7 @@ "data": { "event": { "delta": { - "text": "_interpreter` library, you can use the `pandas.read_csv", + "text": "/b807hgTQinflation.csv\")\ndf = pd.read_csv", "type": "text" }, "event_type": { @@ -14554,7 +14615,7 @@ "data": { "event": { "delta": { - "text": "()` function with the file path as a string:\n\n```\ndf = pd", + "text": "(file_path)\nprint(df.head())\n```\n\n", "type": "text" }, "event_type": { @@ -14574,7 +14635,7 @@ "data": { "event": { "delta": { - "text": ".read_csv(\"/var/folders/cz/vyh7y1d11", + "text": "This code uses the `Path` class from the `pathlib", "type": "text" }, "event_type": { @@ -14594,7 +14655,7 @@ "data": { "event": { "delta": { - "text": "xg881lsxsshnc5c0000gn/T/tmp4ed", + "text": "` module to create a path object for the file. The `", "type": "text" }, "event_type": { @@ -14614,7 +14675,7 @@ "data": { "event": { "delta": { - "text": "7p2bg/Csr659svinflation.csv\")\n```\n\nThis", + "text": "read_csv` method is then used to read the CSV file into", "type": "text" }, "event_type": { @@ -14634,7 +14695,7 @@ "data": { "event": { "delta": { - "text": " should load the CSV file and allow you to inspect its contents.", + "text": " a pandas DataFrame.", "type": "text" }, "event_type": { @@ -14675,7 +14736,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport code_interpreter\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print the data types of each column\\nprint(df.dtypes)\\n\\n# Print the summary statistics of the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\ndf = pd.read_csv(\\\"\")\\nprint(df.head())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stdout]\\nYear Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec\\n0 2014 1.6 1.6 1.7 1.8 2.0 1.9 1.9 1.7 1.7 1.8 1.7 1.6\\n1 2015 1.6 1.7 1.8 1.8 1.7 1.8 1.8 1.8 1.9 1.9 2.0 2.1\\n2 2016 2.2 2.3 2.2 2.1 2.2 2.2 2.2 2.3 2.2 2.1 2.1 2.2\\n3 2017 2.3 2.2 2.0 1.9 1.7 1.7 1.7 1.7 1.7 1.8 1.7 1.8\\n4 2018 1.8 1.8 2.1 2.1 2.2 2.3 2.4 2.2 2.2 2.1 2.2 2.2\\n[/stdout]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -14703,38 +14764,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "import pandas as pd\nimport code_interpreter\n\n# Load the CSV file", - "type": "tool_call" + "text": "The", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -14753,13 +14784,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "\ndf = pd.read_csv(code_interpreter.get_file_path(\"/var", - "type": "tool_call" + "text": " csv file contains data on inflation rates for each month of the year from", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -14778,13 +14804,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "/folders/cz/vyh7y1", - "type": "tool_call" + "text": " 2014 to 2018. The columns are:\n\n- Year", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -14803,13 +14824,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "d11xg881lsxsshnc5c0000gn/T", - "type": "tool_call" + "text": ": The year of the inflation rate\n-", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -14828,13 +14844,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "/tmp4ed7p2bg/Csr659svinflation.csv\"))\n\n", - "type": "tool_call" + "text": " Jan to Dec: The inflation rate for each month of", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -14853,13 +14864,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "# Print the first few rows of the dataframe\nprint(df.head", - "type": "tool_call" + "text": " the year\n\nThe inflation rates are all in the range of ", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -14878,13 +14884,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "())\n\n# Print the data types of each column\nprint(df.dtypes)\n\n", - "type": "tool_call" + "text": "1.6 to 2.4, indicating", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -14903,13 +14904,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "# Print the summary statistics of the dataframe\nprint(df.describe())", - "type": "tool_call" + "text": " a relatively stable inflation rate over the years.", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -14922,45 +14918,6 @@ "metrics": null } }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "code": "import pandas as pd\nimport code_interpreter\n\n# Load the CSV file\ndf = pd.read_csv(code_interpreter.get_file_path(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmp4ed7p2bg/Csr659svinflation.csv\"))\n\n# Print the first few rows of the dataframe\nprint(df.head())\n\n# Print the data types of each column\nprint(df.dtypes)\n\n# Print the summary statistics of the dataframe\nprint(df.describe())" - }, - "call_id": "c5d0fce3-d7c6-4da1-89e4-e727df42f356", - "tool_name": { - "__enum__": "BuiltinTool", - "__module__": "llama_stack.models.llama.datatypes", - "value": "code_interpreter" - } - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": null - } - }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", @@ -14982,13 +14939,29 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 471 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 91 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 562 + } + ] } } ], "type": "generator" }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport code_interpreter\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print the data types of each column\\nprint(df.dtypes)\\n\\n# Print the summary statistics of the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 142, in \\n line 23, in \\n from .code_execution import CodeExecutionContext, CodeExecutionRequest, CodeExecutor\\nImportError: attempted relative import with no known parent package\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport code_interpreter\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print the data types of each column\\nprint(df.dtypes)\\n\\n# Print the summary statistics of the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 142, in \\n line 23, in \\n from .code_execution import CodeExecutionContext, CodeExecutionRequest, CodeExecutor\\nImportError: attempted relative import with no known parent package\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -15016,13 +14989,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" + "text": "The", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -15041,13 +15009,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "import pandas as pd\n", - "type": "tool_call" + "text": " error message indicates that there is an issue with the import statement. However", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -15066,13 +15029,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "import code_interpreter\n\n# Load the CSV file\ndf =", - "type": "tool_call" + "text": ", the code provided does not contain any import statements that", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -15091,13 +15049,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " pd.read_csv(\"/var/folders", - "type": "tool_call" + "text": " would cause this error.\n\nTo provide a more accurate answer, I", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -15116,13 +15069,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "/cz/vyh7y1d11xg881", - "type": "tool_call" + "text": " would need to know the contents of the CSV file or more information", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -15141,13 +15089,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "lsxsshnc5c0000gn/T/tmp4ed7", - "type": "tool_call" + "text": " about the error message.\n\nHowever, based on the code provided, it", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -15166,13 +15109,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "p2bg/Csr659svinflation.csv\")\n\n# Print", - "type": "tool_call" + "text": " seems like the code is trying to load a CSV", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -15191,13 +15129,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " the first few rows of the dataframe\nprint(df.head())\n\n#", - "type": "tool_call" + "text": " file and print some basic information about it. If the file is", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -15216,13 +15149,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " Print the data types of each column\nprint(df.dtypes)\n\n", - "type": "tool_call" + "text": " not found or there is an issue with the file", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -15241,13 +15169,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "# Print the summary statistics of the dataframe", - "type": "tool_call" + "text": " path, this could cause an error.\n\nHere is a revised version", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -15266,13 +15189,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "\nprint(df.describe())", - "type": "tool_call" + "text": " of the code that includes some error handling:\n\n``", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -15291,23 +15209,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "code": "import pandas as pd\nimport code_interpreter\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmp4ed7p2bg/Csr659svinflation.csv\")\n\n# Print the first few rows of the dataframe\nprint(df.head())\n\n# Print the data types of each column\nprint(df.dtypes)\n\n# Print the summary statistics of the dataframe\nprint(df.describe())" - }, - "call_id": "8aeab20b-341b-4349-84dc-3e3c3299d713", - "tool_name": { - "__enum__": "BuiltinTool", - "__module__": "llama_stack.models.llama.datatypes", - "value": "code_interpreter" - } - }, - "type": "tool_call" + "text": "`\nimport pandas as pd\nimport code_interpreter\n\ntry:\n", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -15315,11 +15218,7 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } @@ -15330,42 +15229,33 @@ "data": { "event": { "delta": { - "text": "", + "text": " # Load the CSV file\n df = pd.read_csv(\"/", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"It seems that the file \\\"\" does not exist. \\n\\nTo describe the csv file, you need to provide the actual file path or the file itself. If the file is in your current directory, you can use the following code:\\n\\n```python\\nimport pandas as pd\\n# Load data\\ndf = pd.read_csv('inflation.csv')\\n# Print the first 5 rows of the dataframe\\nprint(df.head())\\n# Print the summary of the dataframe\\nprint(df.info())\\nprint(df.describe())\\n```\\n\\nThis will print the first 5 rows of the dataframe, the summary of the dataframe (including the index dtype and column count), and the description of the dataframe (including count, mean, std, min, 25%, 50%, 75%, max for each column).\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load data\\ndf = pd.read_csv('inflation.csv')\\n\\n# Convert 'date' column to datetime\\ndf['date'] = pd.to_datetime(df['date'])\\n\\n# Group by year and calculate average inflation\\naverage_inflation = df.groupby(df['date'].dt.year)['inflation'].mean()\\n\\n# Plot the time series\\nplt.figure(figsize=(10,6))\\nplt.plot(average_inflation.index, average_inflation.values, marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Average Inflation')\\nplt.grid(True)\\nplt.show()\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", + "text": "var/folders/cz/vyh7y1d11x", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -15379,7 +15269,7 @@ "data": { "event": { "delta": { - "text": "This", + "text": "g881lsxsshnc5c0000gn/T/tmp", "type": "text" }, "event_type": { @@ -15399,7 +15289,7 @@ "data": { "event": { "delta": { - "text": " code will create a line plot of", + "text": "_d_cdeif/6TpkUAo", "type": "text" }, "event_type": { @@ -15419,7 +15309,7 @@ "data": { "event": { "delta": { - "text": " the average yearly inflation over time. The x", + "text": "0inflation.csv\")\n\n # Print the first few rows of the", "type": "text" }, "event_type": { @@ -15439,7 +15329,7 @@ "data": { "event": { "delta": { - "text": "-axis represents the year and the y-axis represents the", + "text": " dataframe\n print(df.head())\n\n # Print the", "type": "text" }, "event_type": { @@ -15459,7 +15349,7 @@ "data": { "event": { "delta": { - "text": " average inflation. Each point on the plot represents", + "text": " data types of each column\n print(df", "type": "text" }, "event_type": { @@ -15479,7 +15369,7 @@ "data": { "event": { "delta": { - "text": " the average inflation for a particular year.\n\nPlease note that you", + "text": ".dtypes)\n\n # Print the summary statistics of the dataframe\n", "type": "text" }, "event_type": { @@ -15499,7 +15389,7 @@ "data": { "event": { "delta": { - "text": " need to replace 'inflation.csv'", + "text": " print(df.describe())\n\nexcept FileNotFoundError:\n print(\"The file was", "type": "text" }, "event_type": { @@ -15519,7 +15409,7 @@ "data": { "event": { "delta": { - "text": " with the actual path to your csv file. Also,", + "text": " not found.\")\nexcept pd.errors.EmptyDataError:\n print(\"The", "type": "text" }, "event_type": { @@ -15539,7 +15429,7 @@ "data": { "event": { "delta": { - "text": " this code assumes that the 'date' column in your csv", + "text": " file is empty.\")\nexcept pd.errors.ParserError:\n print(\"An", "type": "text" }, "event_type": { @@ -15559,7 +15449,7 @@ "data": { "event": { "delta": { - "text": " file is in a format that can be parsed by pandas' `to", + "text": " error occurred while parsing the file.\")\nexcept Exception as e:\n print", "type": "text" }, "event_type": { @@ -15579,7 +15469,7 @@ "data": { "event": { "delta": { - "text": "_datetime` function. If the date is in a different", + "text": "(\"An error occurred: \", str(e))\n```\n\nThis code will", "type": "text" }, "event_type": { @@ -15599,7 +15489,7 @@ "data": { "event": { "delta": { - "text": " format, you may need to specify the format using the `format", + "text": " catch specific exceptions that could occur when loading the CSV file and print a", "type": "text" }, "event_type": { @@ -15619,7 +15509,7 @@ "data": { "event": { "delta": { - "text": "` parameter of `to_datetime`.", + "text": " more informative error message.", "type": "text" }, "event_type": { @@ -15654,13 +15544,29 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 391 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 330 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 721 + } + ] } } ], "type": "generator" }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"It seems that the file \\\"\" does not exist. \\n\\nTo describe the csv file, you need to provide the actual file path or the file itself. If the file is in your current directory, you can use the following code:\\n\\n```python\\nimport pandas as pd\\n# Load data\\ndf = pd.read_csv('inflation.csv')\\n# Print the first 5 rows of the dataframe\\nprint(df.head())\\n# Print the summary of the dataframe\\nprint(df.info())\\nprint(df.describe())\\n```\\n\\nThis will print the first 5 rows of the dataframe, the summary of the dataframe (including the index dtype and column count), and the description of the dataframe (including count, mean, std, min, 25%, 50%, 75%, max for each column).\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport code_interpreter\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print the data types of each column\\nprint(df.dtypes)\\n\\n# Print the summary statistics of the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 142, in \\n line 23, in \\n from .code_execution import CodeExecutionContext, CodeExecutionRequest, CodeExecutor\\nImportError: attempted relative import with no known parent package\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -15718,57 +15624,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " data\ndf = pd.read_csv('inflation.csv", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "')\n\n# Convert 'date' column to datetime\ndf['date']", + "tool_call": "import pandas as pd\nimport code_interpreter\n\n", "type": "tool_call" }, "event_type": { @@ -15793,7 +15649,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " = pd.to_datetime(df['date'])\n\n# Group by year and calculate", + "tool_call": "# Load the CSV file\ndf = pd.read_csv(\"/var", "type": "tool_call" }, "event_type": { @@ -15818,7 +15674,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " average inflation\naverage_inflation = df.groupby(df['date'].dt", + "tool_call": "/folders/cz/vyh7y1d11xg881", "type": "tool_call" }, "event_type": { @@ -15843,7 +15699,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": ".year)['inflation'].mean()\n\n# Plot", + "tool_call": "lsxsshnc5c0000gn/T/tmp_d_cdeif", "type": "tool_call" }, "event_type": { @@ -15868,7 +15724,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " the time series\nplt.figure(figsize=(", + "tool_call": "/6TpkUAo0inflation.csv\")\n\n# Print the", "type": "tool_call" }, "event_type": { @@ -15893,7 +15749,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "10,6))\nplt.plot(average", + "tool_call": " first few rows of the dataframe\nprint(df.head())\n\n# Print the", "type": "tool_call" }, "event_type": { @@ -15918,7 +15774,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "_inflation.index, average_inflation.values", + "tool_call": " data types of each column\nprint(df.dtypes)\n\n# Print the", "type": "tool_call" }, "event_type": { @@ -15943,57 +15799,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": ", marker='o')\nplt.title('Average Yearly Inflation')\n", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "plt.xlabel('Year')\nplt.ylabel('Average Inflation')\nplt.grid", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "(True)\nplt.show()", + "tool_call": " summary statistics of the dataframe\nprint(df.describe())", "type": "tool_call" }, "event_type": { @@ -16020,9 +15826,9 @@ }, "tool_call": { "arguments": { - "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load data\ndf = pd.read_csv('inflation.csv')\n\n# Convert 'date' column to datetime\ndf['date'] = pd.to_datetime(df['date'])\n\n# Group by year and calculate average inflation\naverage_inflation = df.groupby(df['date'].dt.year)['inflation'].mean()\n\n# Plot the time series\nplt.figure(figsize=(10,6))\nplt.plot(average_inflation.index, average_inflation.values, marker='o')\nplt.title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel('Average Inflation')\nplt.grid(True)\nplt.show()" + "code": "import pandas as pd\nimport code_interpreter\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmp_d_cdeif/6TpkUAo0inflation.csv\")\n\n# Print the first few rows of the dataframe\nprint(df.head())\n\n# Print the data types of each column\nprint(df.dtypes)\n\n# Print the summary statistics of the dataframe\nprint(df.describe())" }, - "call_id": "ae9d3d8c-ece8-4f94-aa92-a6a93b08b43e", + "call_id": "fa1b393f-3fc7-416f-98ab-05d879def880", "tool_name": { "__enum__": "BuiltinTool", "__module__": "llama_stack.models.llama.datatypes", @@ -16067,13 +15873,29 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 214 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 224 + } + ] } } ], "type": "generator" }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"It seems that the file \\\"\" does not exist. \\n\\nTo describe the csv file, you need to provide the actual file path or the file itself. If the file is too large to be uploaded, you can provide a sample of the csv file and I can help you describe it. \\n\\nHere is an example of how you can describe a csv file using pandas:\\n\\n```\\nimport pandas as pd\\n# Load data\\ndf = pd.read_csv('inflation.csv')\\n# Print the first 5 rows of the data\\nprint(df.head())\\n# Print the last 5 rows of the data\\nprint(df.tail())\\n# Print the summary statistics of the data\\nprint(df.describe())\\n# Print the data types of each column\\nprint(df.dtypes)\\n# Print the number of missing values in each column\\nprint(df.isnull().sum())\\n```\\n\\nThis will give you an idea of what the csv file contains.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load data\\ndf = pd.read_csv('inflation.csv')\\n\\n# Convert 'date' column to datetime\\ndf['date'] = pd.to_datetime(df['date'])\\n\\n# Group by year and calculate average inflation\\naverage_inflation = df.groupby(df['date'].dt.year)['inflation'].mean()\\n\\n# Plot the time series\\nplt.figure(figsize=(10,6))\\nplt.plot(average_inflation.index, average_inflation.values, marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Average Inflation')\\nplt.grid(True)\\nplt.show()\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport code_interpreter\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print the data types of each column\\nprint(df.dtypes)\\n\\n# Print the summary statistics of the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport code_interpreter\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print the data types of each column\\nprint(df.dtypes)\\n\\n# Print the summary statistics of the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -16101,7 +15923,7 @@ "data": { "event": { "delta": { - "text": "This", + "text": "I", "type": "text" }, "event_type": { @@ -16121,7 +15943,7 @@ "data": { "event": { "delta": { - "text": " code will create a line plot of the average yearly inflation over time. The", + "text": "'m unable to access the file you provided", "type": "text" }, "event_type": { @@ -16141,7 +15963,7 @@ "data": { "event": { "delta": { - "text": " x-axis represents the year and the y", + "text": ". However, I", "type": "text" }, "event_type": { @@ -16161,7 +15983,7 @@ "data": { "event": { "delta": { - "text": "-axis represents the average inflation. Each point on the plot represents", + "text": " can suggest how you can describe the CSV file using the pandas library in Python.\n\nYou can use the `head()`, `dtypes`, and `describe()` functions to get an overview of the CSV file", "type": "text" }, "event_type": { @@ -16181,7 +16003,7 @@ "data": { "event": { "delta": { - "text": " the average inflation for a particular year.\n\nPlease note that you need", + "text": ".\n\n- `head()`: This function prints the first few rows of the", "type": "text" }, "event_type": { @@ -16201,7 +16023,7 @@ "data": { "event": { "delta": { - "text": " to replace 'inflation.csv' with the actual path", + "text": " dataframe, giving you an idea of what the", "type": "text" }, "event_type": { @@ -16221,7 +16043,7 @@ "data": { "event": { "delta": { - "text": " to your csv file. Also, this code assumes that the csv file", + "text": " data looks like.\n- `dtypes`: This", "type": "text" }, "event_type": { @@ -16241,7 +16063,7 @@ "data": { "event": { "delta": { - "text": " has a column named 'date' and another column named 'inflation", + "text": " function prints the data types of each column in the", "type": "text" }, "event_type": { @@ -16261,7 +16083,7 @@ "data": { "event": { "delta": { - "text": "'. If your csv file has different column names", + "text": " dataframe.\n- `describe()`: This function prints summary", "type": "text" }, "event_type": { @@ -16281,7 +16103,7 @@ "data": { "event": { "delta": { - "text": ", you need to replace 'date' and 'inflation'", + "text": " statistics of the dataframe, including mean, standard deviation, minimum, maximum,", "type": "text" }, "event_type": { @@ -16301,7 +16123,107 @@ "data": { "event": { "delta": { - "text": " with the actual column names.", + "text": " and quartiles for numeric columns, and count and unique values for", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " object columns.\n\nIf you want to get more information about the CSV file,", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " you can use the `info()` function, which prints a concise summary", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " of the dataframe, including the index dtype and column dtypes, non-", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "nullable values, and memory usage.\n\nPlease make sure the file is in the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " correct format and is accessible to the Python script.", "type": "text" }, "event_type": { @@ -16342,7 +16264,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"It seems that the file \\\"\" does not exist. \\n\\nTo describe the csv file, you need to provide the actual file path or the file itself. If the file is too large to be uploaded, you can provide a sample of the csv file and I can help you describe it. \\n\\nHere is an example of how you can describe a csv file using pandas:\\n\\n```\\nimport pandas as pd\\n# Load data\\ndf = pd.read_csv('inflation.csv')\\n# Print the first 5 rows of the data\\nprint(df.head())\\n# Print the last 5 rows of the data\\nprint(df.tail())\\n# Print the summary statistics of the data\\nprint(df.describe())\\n# Print the data types of each column\\nprint(df.dtypes)\\n# Print the number of missing values in each column\\nprint(df.isnull().sum())\\n```\\n\\nThis will give you an idea of what the csv file contains.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport code_interpreter\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print the data types of each column\\nprint(df.dtypes)\\n\\n# Print the summary statistics of the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport code_interpreter\\n\\n# Load the CSV file\\ndf = pd.read_csv(code_interpreter.get_file_path(\\\"\"))\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print the data types of each column\\nprint(df.dtypes)\\n\\n# Print the summary statistics of the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -16370,13 +16292,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" + "text": "The", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -16395,13 +16312,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load data\n", - "type": "tool_call" + "text": " error message indicates that the `bwrap.core` module is not found", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -16420,13 +16332,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "df = pd.read_csv('inflation.csv')\n\n", - "type": "tool_call" + "text": ". This is likely because the `bwrap` library is not installed", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -16445,13 +16352,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "# Convert 'date' column to datetime\ndf['date']", - "type": "tool_call" + "text": ". To fix this, you can install the", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -16470,13 +16372,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " = pd.to_datetime(df['date'])\n\n#", - "type": "tool_call" + "text": " `bwrap` library using pip:\n\n```\npip install", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -16495,13 +16392,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " Group by year and calculate average inflation\naverage_inflation = df", - "type": "tool_call" + "text": " bwrap\n```\n\nIf you are still facing issues", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -16520,13 +16412,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": ".groupby(df['date'].dt.year)['inflation'].mean()\n\n#", - "type": "tool_call" + "text": ", you can try to use the `code_interpreter.get_file_path", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -16545,13 +16432,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " Plot the time series\nplt.figure(figsize=(10,6))\nplt", - "type": "tool_call" + "text": "()` function to load the CSV file directly, as shown in the corrected", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -16570,13 +16452,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": ".plot(average_inflation.index, average_inflation.values, marker='o", - "type": "tool_call" + "text": " code above.\n\nAlternatively, if you don't have access to the `code", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -16595,13 +16472,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "')\nplt.title('Average Yearly Inflation')\nplt.xlabel('Year')\n", - "type": "tool_call" + "text": "_interpreter` library, you can use the `pandas.read_csv", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -16620,13 +16492,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "plt.ylabel('Average Inflation')\nplt.grid(True)\nplt.show()", - "type": "tool_call" + "text": "()` function with the file path as a string:\n\n```\ndf = pd", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -16645,23 +16512,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load data\ndf = pd.read_csv('inflation.csv')\n\n# Convert 'date' column to datetime\ndf['date'] = pd.to_datetime(df['date'])\n\n# Group by year and calculate average inflation\naverage_inflation = df.groupby(df['date'].dt.year)['inflation'].mean()\n\n# Plot the time series\nplt.figure(figsize=(10,6))\nplt.plot(average_inflation.index, average_inflation.values, marker='o')\nplt.title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel('Average Inflation')\nplt.grid(True)\nplt.show()" - }, - "call_id": "91ad7e4c-2e89-4cb5-9d0b-753ceafb7eab", - "tool_name": { - "__enum__": "BuiltinTool", - "__module__": "llama_stack.models.llama.datatypes", - "value": "code_interpreter" - } - }, - "type": "tool_call" + "text": ".read_csv(\"/var/folders/cz/vyh7y1d11", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -16669,11 +16521,67 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "xg881lsxsshnc5c0000gn/T/tmp4ed", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "7p2bg/Csr659svinflation.csv\")\n```\n\nThis", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " should load the CSV file and allow you to inspect its contents.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null }, "metrics": null } @@ -16705,7 +16613,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"It seems that the file \\\"\" does not exist. \\n\\nTo describe the csv file, you need to provide the actual file path or the file itself. If you are using a local file, you can use the `load_data` function from the `code_interpreter` library to load the file. \\n\\nHere is an example of how you can describe the csv file:\\n\\n```\\nimport pandas as pd\\nfrom code_interpreter import load_data\\n\\n# Load data\\ndf = load_data('inflation.csv')\\n\\n# Print summary of the data\\nprint(df.head()) # Print the first few rows of the data\\nprint(df.info()) # Print information about the data\\nprint(df.describe()) # Print summary statistics about the data\\n```\\n\\nPlease replace 'inflation.csv' with your actual csv file name. \\n\\nIf you are using a remote file, you need to provide the actual file path or the file itself. \\n\\nAlso, make sure that the file is in the correct format and that the pandas library can read it correctly.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load data\\ndf = pd.read_csv(\\\"inflation.csv\\\")\\n\\n# Convert date column to datetime\\ndf['date'] = pd.to_datetime(df['date'])\\n\\n# Group by year and calculate average inflation\\naverage_inflation = df.groupby(df['date'].dt.year)['inflation'].mean()\\n\\n# Plot average yearly inflation as a time series\\nplt.figure(figsize=(10,6))\\nplt.plot(average_inflation.index, average_inflation.values, marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Average Inflation')\\nplt.grid(True)\\nplt.show()\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport code_interpreter\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print the data types of each column\\nprint(df.dtypes)\\n\\n# Print the summary statistics of the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -16733,8 +16641,13 @@ "data": { "event": { "delta": { - "text": "This", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -16753,8 +16666,13 @@ "data": { "event": { "delta": { - "text": " code will create a line plot of the", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\nimport code_interpreter\n\n# Load the CSV file", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -16773,8 +16691,13 @@ "data": { "event": { "delta": { - "text": " average yearly inflation over time. The x", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\ndf = pd.read_csv(code_interpreter.get_file_path(\"/var", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -16793,8 +16716,13 @@ "data": { "event": { "delta": { - "text": "-axis represents the year and the y-axis represents the average inflation", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "/folders/cz/vyh7y1", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -16813,8 +16741,13 @@ "data": { "event": { "delta": { - "text": ". The plot will also include a title, labels for the", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "d11xg881lsxsshnc5c0000gn/T", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -16833,8 +16766,13 @@ "data": { "event": { "delta": { - "text": " x and y axes, and a grid to make it easier", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "/tmp4ed7p2bg/Csr659svinflation.csv\"))\n\n", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -16853,8 +16791,13 @@ "data": { "event": { "delta": { - "text": " to read.\n\nPlease replace \"inflation.csv\" with your", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "# Print the first few rows of the dataframe\nprint(df.head", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -16873,8 +16816,13 @@ "data": { "event": { "delta": { - "text": " actual csv file name. \n\nAlso, make sure that the file", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "())\n\n# Print the data types of each column\nprint(df.dtypes)\n\n", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -16893,8 +16841,13 @@ "data": { "event": { "delta": { - "text": " is in the correct format and that the pandas library can read it", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "# Print the summary statistics of the dataframe\nprint(df.describe())", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -16913,8 +16866,23 @@ "data": { "event": { "delta": { - "text": " correctly. \n\nIf your csv file has a different column name for", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\nimport code_interpreter\n\n# Load the CSV file\ndf = pd.read_csv(code_interpreter.get_file_path(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmp4ed7p2bg/Csr659svinflation.csv\"))\n\n# Print the first few rows of the dataframe\nprint(df.head())\n\n# Print the data types of each column\nprint(df.dtypes)\n\n# Print the summary statistics of the dataframe\nprint(df.describe())" + }, + "call_id": "c5d0fce3-d7c6-4da1-89e4-e727df42f356", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -16922,7 +16890,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -16933,13 +16905,42 @@ "data": { "event": { "delta": { - "text": " the date, you will need to replace", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" }, "logprobs": null, "stop_reason": null @@ -16953,8 +16954,13 @@ "data": { "event": { "delta": { - "text": " 'date' with the actual column name. \n\nIf your csv", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -16973,8 +16979,13 @@ "data": { "event": { "delta": { - "text": " file has a different column name for the inflation, you will need", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\ndf = pd.read_csv(\"/var/folders/c", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -16993,8 +17004,13 @@ "data": { "event": { "delta": { - "text": " to replace 'inflation' with the actual column name. \n\n", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "z/vyh7y1d11xg881lsxsshnc5", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -17013,8 +17029,13 @@ "data": { "event": { "delta": { - "text": "If you want to save the plot to a file instead of displaying", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "c0000gn/T/tmpe8u6r9sz/R", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -17033,8 +17054,13 @@ "data": { "event": { "delta": { - "text": " it, you can use the `savefig` method. For", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "ChoI8s0inflation.csv\")\nprint(df.head())", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -17053,8 +17079,23 @@ "data": { "event": { "delta": { - "text": " example:\n\n```\nplt.savefig('average_inflation.png')\n```", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpe8u6r9sz/RChoI8s0inflation.csv\")\nprint(df.head())" + }, + "call_id": "35e85870-f8f3-44f4-8879-e7b02a2805f6", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -17062,7 +17103,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -17090,55 +17135,19 @@ }, "metrics": [ { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, "metric": "prompt_tokens", - "span_id": "2Yx8i0id", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:47:51.132007+00:00", - "__module__": "datetime" - }, - "trace_id": "N2BeNv66RcO7NRuE", - "type": "metric", - "unit": "tokens", - "value": 666 + "unit": null, + "value": 37 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, "metric": "completion_tokens", - "span_id": "2Yx8i0id", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:47:51.132048+00:00", - "__module__": "datetime" - }, - "trace_id": "N2BeNv66RcO7NRuE", - "type": "metric", - "unit": "tokens", - "value": 200 + "unit": null, + "value": 10 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, "metric": "total_tokens", - "span_id": "2Yx8i0id", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:47:51.132054+00:00", - "__module__": "datetime" - }, - "trace_id": "N2BeNv66RcO7NRuE", - "type": "metric", - "unit": "tokens", - "value": 866 + "unit": null, + "value": 47 } ] } @@ -17146,7 +17155,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"It seems that the file \\\"\" does not exist. \\n\\nTo describe the csv file, you need to provide the actual file path or the file itself. If you are using a local file, you can use the `load_data` function from the `code_interpreter` library to load the file. \\n\\nHere is an example of how you can describe the csv file:\\n\\n```\\nimport pandas as pd\\nfrom code_interpreter import load_data\\n\\n# Load data\\ndf = load_data('inflation.csv')\\n\\n# Print summary of the data\\nprint(df.head()) # Print the first few rows of the data\\nprint(df.info()) # Print information about the data\\nprint(df.describe()) # Print summary statistics about the data\\n```\\n\\nPlease replace 'inflation.csv' with your actual csv file name. \\n\\nIf you are using a remote file, you need to provide the actual file path or the file itself. \\n\\nAlso, make sure that the file is in the correct format and that the pandas library can read it correctly.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\ndf = pd.read_csv(\\\"\")\\nprint(df.head())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stdout]\\nYear Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec\\n0 2014 1.6 1.6 1.7 1.8 2.0 1.9 1.9 1.7 1.7 1.8 1.7 1.6\\n1 2015 1.6 1.7 1.8 1.8 1.7 1.8 1.8 1.8 1.9 1.9 2.0 2.1\\n2 2016 2.2 2.3 2.2 2.1 2.2 2.2 2.2 2.3 2.2 2.1 2.1 2.2\\n3 2017 2.3 2.2 2.0 1.9 1.7 1.7 1.7 1.7 1.7 1.8 1.7 1.8\\n4 2018 1.8 1.8 2.1 2.1 2.2 2.3 2.4 2.2 2.2 2.1 2.2 2.2\\n[/stdout]\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -17174,13 +17183,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" + "text": "The", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -17199,13 +17203,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load", - "type": "tool_call" + "text": " csv file contains a table with 12 columns (Jan to", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -17224,13 +17223,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " data\ndf = pd.read_csv(\"", - "type": "tool_call" + "text": " Dec) and 5 rows (2014 to", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -17249,13 +17243,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "inflation.csv\")\n\n# Convert date column to datetime\ndf", - "type": "tool_call" + "text": " 2018). The values in the table represent the inflation rate", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -17274,13 +17263,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "['date'] = pd.to_datetime(df['date'])\n\n# Group", - "type": "tool_call" + "text": " for each month of the year from 2014", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -17299,13 +17283,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " by year and calculate average inflation\naverage_inflation = df.groupby", - "type": "tool_call" + "text": " to 2018.", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -17324,43 +17303,58 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "(df['date'].dt.year)['inflation'].mean()\n\n#", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 469 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 61 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 530 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport code_interpreter\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print the data types of each column\\nprint(df.dtypes)\\n\\n# Print the summary statistics of the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 142, in \\n line 23, in \\n from .code_execution import CodeExecutionContext, CodeExecutionRequest, CodeExecutor\\nImportError: attempted relative import with no known parent package\\n[/stderr]\", \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport code_interpreter\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print the data types of each column\\nprint(df.dtypes)\\n\\n# Print the summary statistics of the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 142, in \\n line 23, in \\n from .code_execution import CodeExecutionContext, CodeExecutionRequest, CodeExecutor\\nImportError: attempted relative import with no known parent package\\n[/stderr]\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " Plot average yearly inflation as a time series\nplt.figure(figsize=(", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -17374,13 +17368,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "10,6))\nplt.plot(average_in", - "type": "tool_call" + "text": "The", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -17399,13 +17388,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "flation.index, average_inflation.values, marker='o')\nplt", - "type": "tool_call" + "text": " error message indicates that there is an issue with", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -17424,13 +17408,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": ".title('Average Yearly Inflation')\nplt.xlabel('Year')\n", - "type": "tool_call" + "text": " the import statement. However, the code provided does not contain any", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -17449,13 +17428,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "plt.ylabel('Average Inflation')\nplt.grid(True)\nplt.show", - "type": "tool_call" + "text": " import statements that would cause this error.\n\nTo provide a more accurate", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -17474,13 +17448,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "()", - "type": "tool_call" + "text": " answer, I would need to know the contents of the CSV file", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -17499,23 +17468,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load data\ndf = pd.read_csv(\"inflation.csv\")\n\n# Convert date column to datetime\ndf['date'] = pd.to_datetime(df['date'])\n\n# Group by year and calculate average inflation\naverage_inflation = df.groupby(df['date'].dt.year)['inflation'].mean()\n\n# Plot average yearly inflation as a time series\nplt.figure(figsize=(10,6))\nplt.plot(average_inflation.index, average_inflation.values, marker='o')\nplt.title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel('Average Inflation')\nplt.grid(True)\nplt.show()" - }, - "call_id": "cfae3ff5-49f8-439d-b740-603bc93fb5a3", - "tool_name": { - "__enum__": "BuiltinTool", - "__module__": "llama_stack.models.llama.datatypes", - "value": "code_interpreter" - } - }, - "type": "tool_call" + "text": " or more information about the error message.\n\nHowever, based on the", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -17523,11 +17477,7 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } @@ -17538,94 +17488,33 @@ "data": { "event": { "delta": { - "text": "", + "text": " code provided, it seems like the code is trying to load a", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "JNrmlTTc", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:47:39.920493+00:00", - "__module__": "datetime" - }, - "trace_id": "N2BeNv66RcO7NRuE", - "type": "metric", - "unit": "tokens", - "value": 476 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "JNrmlTTc", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:47:39.920519+00:00", - "__module__": "datetime" - }, - "trace_id": "N2BeNv66RcO7NRuE", - "type": "metric", - "unit": "tokens", - "value": 10 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "JNrmlTTc", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:47:39.920522+00:00", - "__module__": "datetime" - }, - "trace_id": "N2BeNv66RcO7NRuE", - "type": "metric", - "unit": "tokens", - "value": 486 - } - ] + "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", + "text": " CSV file and print some basic information about it. If the file", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -17639,7 +17528,7 @@ "data": { "event": { "delta": { - "text": "It", + "text": " is not found or there is an issue with the file path,", "type": "text" }, "event_type": { @@ -17659,7 +17548,7 @@ "data": { "event": { "delta": { - "text": " seems that the file \"/var/folders", + "text": " this could cause an error.\n\nHere is a", "type": "text" }, "event_type": { @@ -17679,7 +17568,7 @@ "data": { "event": { "delta": { - "text": "/cz/vyh7y1d11xg881", + "text": " revised version of the code that includes some error", "type": "text" }, "event_type": { @@ -17699,7 +17588,7 @@ "data": { "event": { "delta": { - "text": "lsxsshnc5c0000gn/T/tmp4ed7", + "text": " handling:\n\n```\nimport pandas as pd\nimport code_interpreter", "type": "text" }, "event_type": { @@ -17719,7 +17608,7 @@ "data": { "event": { "delta": { - "text": "p2bg/UZ0Z335vinflation.csv\" does", + "text": "\n\ntry:\n # Load the CSV file", "type": "text" }, "event_type": { @@ -17739,7 +17628,7 @@ "data": { "event": { "delta": { - "text": " not exist. \n\nTo describe the csv file, you need to", + "text": "\n df = pd.read_csv(\"/var/folders/cz", "type": "text" }, "event_type": { @@ -17759,7 +17648,7 @@ "data": { "event": { "delta": { - "text": " provide the actual file path or the file itself. If the file", + "text": "/vyh7y1d11xg", "type": "text" }, "event_type": { @@ -17779,7 +17668,7 @@ "data": { "event": { "delta": { - "text": " is too large to be uploaded, you can provide a sample", + "text": "881lsxsshnc5", "type": "text" }, "event_type": { @@ -17799,7 +17688,7 @@ "data": { "event": { "delta": { - "text": " of the csv file and I can help you describe it. \n\nHere is", + "text": "c0000gn/T/tmpflpgiagc/", "type": "text" }, "event_type": { @@ -17819,7 +17708,7 @@ "data": { "event": { "delta": { - "text": " an example of how you can describe a", + "text": "8S20Zj2Oinflation.csv\")\n\n ", "type": "text" }, "event_type": { @@ -17839,7 +17728,7 @@ "data": { "event": { "delta": { - "text": " csv file using pandas:\n\n```\nimport pandas as pd\n#", + "text": " # Print the first few rows of the dataframe\n print(df.head", "type": "text" }, "event_type": { @@ -17859,7 +17748,7 @@ "data": { "event": { "delta": { - "text": " Load data\ndf = pd.read_csv('", + "text": "())\n\n # Print the data types of each column\n print", "type": "text" }, "event_type": { @@ -17879,7 +17768,7 @@ "data": { "event": { "delta": { - "text": "inflation.csv')\n# Print the first 5 rows of the", + "text": "(df.dtypes)\n\n # Print the", "type": "text" }, "event_type": { @@ -17899,7 +17788,7 @@ "data": { "event": { "delta": { - "text": " data\nprint(df.head())\n# Print the last 5 rows of the", + "text": " summary statistics of the dataframe\n ", "type": "text" }, "event_type": { @@ -17919,7 +17808,7 @@ "data": { "event": { "delta": { - "text": " data\nprint(df.tail())\n# Print the summary statistics of the data\n", + "text": " print(df.describe())\n\nexcept FileNotFoundError:\n print(\"The file", "type": "text" }, "event_type": { @@ -17939,7 +17828,7 @@ "data": { "event": { "delta": { - "text": "print(df.describe())\n# Print the data types of each column\nprint(df", + "text": " was not found.\")\nexcept pd.errors.EmptyDataError", "type": "text" }, "event_type": { @@ -17959,7 +17848,7 @@ "data": { "event": { "delta": { - "text": ".dtypes)\n# Print the number of missing values in each column\nprint", + "text": ":\n print(\"The file is empty.\")\nexcept pd.errors.ParserError", "type": "text" }, "event_type": { @@ -17979,7 +17868,7 @@ "data": { "event": { "delta": { - "text": "(df.isnull().sum())\n```\n\nThis will give you an idea of", + "text": ":\n print(\"An error occurred while parsing the", "type": "text" }, "event_type": { @@ -17999,7 +17888,7 @@ "data": { "event": { "delta": { - "text": " what the csv file contains.", + "text": " file.\")\nexcept Exception as e:\n print", "type": "text" }, "event_type": { @@ -18019,42 +17908,33 @@ "data": { "event": { "delta": { - "text": "", + "text": "(\"An error occurred: \", str(e))\n``", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", + "text": "`\n\nThis code will catch specific exceptions that could occur when loading the", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -18068,13 +17948,28 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" + "text": " CSV file and print a more", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " informative error message.", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -18087,6 +17982,71 @@ "metrics": null } }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 393 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 331 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 724 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport code_interpreter\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print the data types of each column\\nprint(df.dtypes)\\n\\n# Print the summary statistics of the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 142, in \\n line 23, in \\n from .code_execution import CodeExecutionContext, CodeExecutionRequest, CodeExecutor\\nImportError: attempted relative import with no known parent package\\n[/stderr]\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", @@ -18096,9 +18056,9 @@ "parse_status": { "__enum__": "ToolCallParseStatus", "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" + "value": "started" }, - "tool_call": "import pandas as pd\n# Load data\ndf = pd.read", + "tool_call": "", "type": "tool_call" }, "event_type": { @@ -18123,7 +18083,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "_csv(\"/var/folders/cz/vyh7y1d", + "tool_call": "import pandas as pd\nimport code_interpreter\n\n", "type": "tool_call" }, "event_type": { @@ -18148,7 +18108,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "11xg881lsxsshnc", + "tool_call": "# Load the CSV file\ndf = pd.read_csv(\"/var/f", "type": "tool_call" }, "event_type": { @@ -18173,7 +18133,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "5c0000gn/T/tmp4ed7p2bg/U", + "tool_call": "olders/cz/vyh7y1d11xg881lsx", "type": "tool_call" }, "event_type": { @@ -18198,7 +18158,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "Z0Z335vinflation.csv\")\n# Rows\nprint(\"", + "tool_call": "sshnc5c0000gn/T/tmpfl", "type": "tool_call" }, "event_type": { @@ -18223,7 +18183,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "Number of rows and columns in the data:\", df.shape)\n# Columns", + "tool_call": "pgiagc/8S20Zj2Oinflation", "type": "tool_call" }, "event_type": { @@ -18248,7 +18208,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "\nprint(\"Columns of the data are:\", len(df.columns))\n# Column", + "tool_call": ".csv\")\n\n# Print the first few rows of the", "type": "tool_call" }, "event_type": { @@ -18273,7 +18233,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " names\nprint(\"Columns of the data are:\", df.columns)\n# Column", + "tool_call": " dataframe\nprint(df.head())\n\n# Print the data types of each", "type": "tool_call" }, "event_type": { @@ -18298,7 +18258,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " dtypes\nprint(\"Datatype of the columns are:\", df.dtypes", + "tool_call": " column\nprint(df.dtypes)\n\n#", "type": "tool_call" }, "event_type": { @@ -18323,7 +18283,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": ")", + "tool_call": " Print the summary statistics of the dataframe\nprint(df.describe())", "type": "tool_call" }, "event_type": { @@ -18350,9 +18310,9 @@ }, "tool_call": { "arguments": { - "code": "import pandas as pd\n# Load data\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmp4ed7p2bg/UZ0Z335vinflation.csv\")\n# Rows\nprint(\"Number of rows and columns in the data:\", df.shape)\n# Columns\nprint(\"Columns of the data are:\", len(df.columns))\n# Column names\nprint(\"Columns of the data are:\", df.columns)\n# Column dtypes\nprint(\"Datatype of the columns are:\", df.dtypes)" + "code": "import pandas as pd\nimport code_interpreter\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpflpgiagc/8S20Zj2Oinflation.csv\")\n\n# Print the first few rows of the dataframe\nprint(df.head())\n\n# Print the data types of each column\nprint(df.dtypes)\n\n# Print the summary statistics of the dataframe\nprint(df.describe())" }, - "call_id": "98e27ff4-d4d7-4764-9213-f46bb928ec68", + "call_id": "e999a578-cbd8-4bb8-bc53-deb2fff1ffce", "tool_name": { "__enum__": "BuiltinTool", "__module__": "llama_stack.models.llama.datatypes", @@ -18397,13 +18357,29 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 215 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 225 + } + ] } } ], "type": "generator" }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:24443\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:961ff\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:b49f7\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"How to use LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:961ff\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:961ff\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:961ff\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -18431,8 +18407,13 @@ "data": { "event": { "delta": { - "text": "To", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -18451,8 +18432,13 @@ "data": { "event": { "delta": { - "text": " use LoRA in Torcht", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\nimport code_interpreter\n\n# Load the", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -18471,8 +18457,13 @@ "data": { "event": { "delta": { - "text": "une, you can follow these steps:\n\n1. ", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " CSV file\ndf = pd.read", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -18491,8 +18482,13 @@ "data": { "event": { "delta": { - "text": " Install Torchtune and its dependencies.\n2.", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "_csv(\"/var/folders/cz/vyh", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -18511,8 +18507,13 @@ "data": { "event": { "delta": { - "text": " Download the Llama2 weights and tokenizer.\n3", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "7y1d11xg881lsxsshnc5c", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -18531,8 +18532,13 @@ "data": { "event": { "delta": { - "text": ". Use the `lora_llama2", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "0000gn/T/tmpflpgiagc/8S", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -18551,8 +18557,13 @@ "data": { "event": { "delta": { - "text": "_7b`", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "20Zj2Oinflation.csv\")\n\n# Print the first", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -18571,8 +18582,13 @@ "data": { "event": { "delta": { - "text": " model in Torchtune, which applies LoRA to the", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " few rows of the dataframe\nprint(df.head())\n\n#", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -18591,8 +18607,13 @@ "data": { "event": { "delta": { - "text": " Q and V projections", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " Print the data types of each column\nprint", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -18611,8 +18632,13 @@ "data": { "event": { "delta": { - "text": " by default.\n4. Set the `lora_at", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "(df.dtypes)\n\n# Print the summary statistics of the dataframe", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -18631,8 +18657,13 @@ "data": { "event": { "delta": { - "text": "tn_modules` argument to apply LoRA to", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\nprint(df.describe())", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -18651,8 +18682,23 @@ "data": { "event": { "delta": { - "text": " all linear layers in the self-attention.\n5. Increase", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\nimport code_interpreter\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpflpgiagc/8S20Zj2Oinflation.csv\")\n\n# Print the first few rows of the dataframe\nprint(df.head())\n\n# Print the data types of each column\nprint(df.dtypes)\n\n# Print the summary statistics of the dataframe\nprint(df.describe())" + }, + "call_id": "ea72d524-2d0f-4220-a898-4c295315235e", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -18660,6 +18706,75 @@ "value": "progress" }, "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 37 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 47 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"It seems that the file \\\"\" does not exist. \\n\\nTo describe the csv file, you need to provide the actual file path or the file itself. If the file is in your current directory, you can use the following code:\\n\\n```python\\nimport pandas as pd\\n# Load data\\ndf = pd.read_csv('inflation.csv')\\n# Print the first 5 rows of the dataframe\\nprint(df.head())\\n# Print the summary of the dataframe\\nprint(df.info())\\nprint(df.describe())\\n```\\n\\nThis will print the first 5 rows of the dataframe, the summary of the dataframe (including the index dtype and column count), and the description of the dataframe (including count, mean, std, min, 25%, 50%, 75%, max for each column).\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load data\\ndf = pd.read_csv('inflation.csv')\\n\\n# Convert 'date' column to datetime\\ndf['date'] = pd.to_datetime(df['date'])\\n\\n# Group by year and calculate average inflation\\naverage_inflation = df.groupby(df['date'].dt.year)['inflation'].mean()\\n\\n# Plot the time series\\nplt.figure(figsize=(10,6))\\nplt.plot(average_inflation.index, average_inflation.values, marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Average Inflation')\\nplt.grid(True)\\nplt.show()\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, "stop_reason": null }, "metrics": null @@ -18671,7 +18786,7 @@ "data": { "event": { "delta": { - "text": " the rank and alpha values to experiment with different LoRA configurations", + "text": "This", "type": "text" }, "event_type": { @@ -18691,7 +18806,7 @@ "data": { "event": { "delta": { - "text": ".\n6. Use the `lora", + "text": " code will create a line plot of", "type": "text" }, "event_type": { @@ -18711,7 +18826,7 @@ "data": { "event": { "delta": { - "text": "_finetune_distributed` recipe in Torcht", + "text": " the average yearly inflation over time. The x", "type": "text" }, "event_type": { @@ -18731,7 +18846,7 @@ "data": { "event": { "delta": { - "text": "une to run a LoRA finetune with two", + "text": "-axis represents the year and the y-axis represents the", "type": "text" }, "event_type": { @@ -18751,7 +18866,7 @@ "data": { "event": { "delta": { - "text": " GPUs.\n7. Modify the", + "text": " average inflation. Each point on the plot represents", "type": "text" }, "event_type": { @@ -18771,7 +18886,7 @@ "data": { "event": { "delta": { - "text": " `lora_finetune_distributed` config", + "text": " the average inflation for a particular year.\n\nPlease note that you", "type": "text" }, "event_type": { @@ -18791,7 +18906,7 @@ "data": { "event": { "delta": { - "text": " to apply LoRA to all linear layers in the self-", + "text": " need to replace 'inflation.csv'", "type": "text" }, "event_type": { @@ -18811,7 +18926,7 @@ "data": { "event": { "delta": { - "text": "attention and increase the rank and alpha values.\n8. Run", + "text": " with the actual path to your csv file. Also,", "type": "text" }, "event_type": { @@ -18831,7 +18946,7 @@ "data": { "event": { "delta": { - "text": " the experiment using the modified config.\n\nBy", + "text": " this code assumes that the 'date' column in your csv", "type": "text" }, "event_type": { @@ -18851,7 +18966,7 @@ "data": { "event": { "delta": { - "text": " following these steps, you can use LoRA", + "text": " file is in a format that can be parsed by pandas' `to", "type": "text" }, "event_type": { @@ -18871,7 +18986,7 @@ "data": { "event": { "delta": { - "text": " in Torchtune to fine-tune a Llama2 model", + "text": "_datetime` function. If the date is in a different", "type": "text" }, "event_type": { @@ -18891,7 +19006,7 @@ "data": { "event": { "delta": { - "text": " with a low memory footprint and experiment with different LoRA configurations", + "text": " format, you may need to specify the format using the `format", "type": "text" }, "event_type": { @@ -18911,7 +19026,7 @@ "data": { "event": { "delta": { - "text": ".", + "text": "` parameter of `to_datetime`.", "type": "text" }, "event_type": { @@ -18952,7 +19067,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:24443\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:961ff\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:b49f7\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"It seems that the file \\\"\" does not exist. \\n\\nTo describe the csv file, you need to provide the actual file path or the file itself. If the file is in your current directory, you can use the following code:\\n\\n```python\\nimport pandas as pd\\n# Load data\\ndf = pd.read_csv('inflation.csv')\\n# Print the first 5 rows of the dataframe\\nprint(df.head())\\n# Print the summary of the dataframe\\nprint(df.info())\\nprint(df.describe())\\n```\\n\\nThis will print the first 5 rows of the dataframe, the summary of the dataframe (including the index dtype and column count), and the description of the dataframe (including count, mean, std, min, 25%, 50%, 75%, max for each column).\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -18980,8 +19095,13 @@ "data": { "event": { "delta": { - "text": "{\"", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -19000,8 +19120,13 @@ "data": { "event": { "delta": { - "text": "type\": \"function\", \"name\": \"knowledge_search\", \"parameters\":", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -19020,8 +19145,13 @@ "data": { "event": { "delta": { - "text": " {\"query\": \"How to use LoRA in Torchtune\"}}", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " data\ndf = pd.read_csv('inflation.csv", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -19043,15 +19173,9 @@ "parse_status": { "__enum__": "ToolCallParseStatus", "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "How to use LoRA in Torchtune" - }, - "call_id": "0d852474-6781-48ed-b8c1-778bd0f4e7f0", - "tool_name": "knowledge_search" + "value": "in_progress" }, + "tool_call": "')\n\n# Convert 'date' column to datetime\ndf['date']", "type": "tool_call" }, "event_type": { @@ -19060,11 +19184,7 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } @@ -19075,42 +19195,18 @@ "data": { "event": { "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": null - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:24443\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:961ff\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:b49f7\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " = pd.to_datetime(df['date'])\n\n# Group by year and calculate", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -19124,8 +19220,13 @@ "data": { "event": { "delta": { - "text": "I", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " average inflation\naverage_inflation = df.groupby(df['date'].dt", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -19144,8 +19245,13 @@ "data": { "event": { "delta": { - "text": "'m ready to help you answer questions about", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".year)['inflation'].mean()\n\n# Plot", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -19164,8 +19270,13 @@ "data": { "event": { "delta": { - "text": " Torchtune based on the documentation you provided. What's your", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " the time series\nplt.figure(figsize=(", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -19184,8 +19295,13 @@ "data": { "event": { "delta": { - "text": " first question?", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "10,6))\nplt.plot(average", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -19204,42 +19320,18 @@ "data": { "event": { "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": null - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:2a4c4\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:d4e29\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:d68cc\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"How to use LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:d4e29\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:d4e29\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:d4e29\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"It seems that the file \\\"\" does not exist. \\n\\nTo describe the csv file, you need to provide the actual file path or the file itself. If the file is too large to be uploaded, you can provide a sample of the csv file and I can help you describe it. \\n\\nHere is an example of how you can describe a csv file using pandas:\\n\\n```\\nimport pandas as pd\\n# Load data\\ndf = pd.read_csv('inflation.csv')\\n# Print the first 5 rows of the data\\nprint(df.head())\\n# Print the last 5 rows of the data\\nprint(df.tail())\\n# Print the summary statistics of the data\\nprint(df.describe())\\n# Print the data types of each column\\nprint(df.dtypes)\\n# Print the number of missing values in each column\\nprint(df.isnull().sum())\\n```\\n\\nThis will give you an idea of what the csv file contains.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load data\\ndf = pd.read_csv('inflation.csv')\\n\\n# Convert 'date' column to datetime\\ndf['date'] = pd.to_datetime(df['date'])\\n\\n# Group by year and calculate average inflation\\naverage_inflation = df.groupby(df['date'].dt.year)['inflation'].mean()\\n\\n# Plot the time series\\nplt.figure(figsize=(10,6))\\nplt.plot(average_inflation.index, average_inflation.values, marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Average Inflation')\\nplt.grid(True)\\nplt.show()\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "` model in Torchtune, which applies LoRA to the", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -19373,7 +19508,7 @@ "data": { "event": { "delta": { - "text": " Q and V projections by default.\n4. Load the base model", + "text": "This", "type": "text" }, "event_type": { @@ -19393,7 +19528,7 @@ "data": { "event": { "delta": { - "text": " weights into the LoRA model without any conversion necessary.\n5. ", + "text": " code will create a line plot of the average yearly inflation over time. The", "type": "text" }, "event_type": { @@ -19413,7 +19548,7 @@ "data": { "event": { "delta": { - "text": " Set only LoRA parameters to trainable.\n6. Run the Lo", + "text": " x-axis represents the year and the y", "type": "text" }, "event_type": { @@ -19433,7 +19568,7 @@ "data": { "event": { "delta": { - "text": "RA finetuning recipe in Torchtune with the desired configuration.\n\n", + "text": "-axis represents the average inflation. Each point on the plot represents", "type": "text" }, "event_type": { @@ -19453,7 +19588,7 @@ "data": { "event": { "delta": { - "text": "You can also experiment with different LoRA", + "text": " the average inflation for a particular year.\n\nPlease note that you need", "type": "text" }, "event_type": { @@ -19473,7 +19608,7 @@ "data": { "event": { "delta": { - "text": " configurations, such as applying LoRA to all linear layers in the", + "text": " to replace 'inflation.csv' with the actual path", "type": "text" }, "event_type": { @@ -19493,7 +19628,7 @@ "data": { "event": { "delta": { - "text": " self-attention, increasing the rank, or scaling alpha", + "text": " to your csv file. Also, this code assumes that the csv file", "type": "text" }, "event_type": { @@ -19513,7 +19648,7 @@ "data": { "event": { "delta": { - "text": " and rank together.\n\nNote that LoRA can be beneficial for reducing memory usage", + "text": " has a column named 'date' and another column named 'inflation", "type": "text" }, "event_type": { @@ -19533,7 +19668,7 @@ "data": { "event": { "delta": { - "text": " during fine-tuning, but it may also impact model performance. You", + "text": "'. If your csv file has different column names", "type": "text" }, "event_type": { @@ -19553,7 +19688,7 @@ "data": { "event": { "delta": { - "text": " can trade off memory and model performance by adjusting the LoRA configuration and", + "text": ", you need to replace 'date' and 'inflation'", "type": "text" }, "event_type": { @@ -19573,7 +19708,7 @@ "data": { "event": { "delta": { - "text": " running experiments with different settings.", + "text": " with the actual column names.", "type": "text" }, "event_type": { @@ -19614,7 +19749,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:2a4c4\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:d4e29\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:d68cc\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"It seems that the file \\\"\" does not exist. \\n\\nTo describe the csv file, you need to provide the actual file path or the file itself. If the file is too large to be uploaded, you can provide a sample of the csv file and I can help you describe it. \\n\\nHere is an example of how you can describe a csv file using pandas:\\n\\n```\\nimport pandas as pd\\n# Load data\\ndf = pd.read_csv('inflation.csv')\\n# Print the first 5 rows of the data\\nprint(df.head())\\n# Print the last 5 rows of the data\\nprint(df.tail())\\n# Print the summary statistics of the data\\nprint(df.describe())\\n# Print the data types of each column\\nprint(df.dtypes)\\n# Print the number of missing values in each column\\nprint(df.isnull().sum())\\n```\\n\\nThis will give you an idea of what the csv file contains.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -19642,8 +19777,13 @@ "data": { "event": { "delta": { - "text": "{\"", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -19662,8 +19802,13 @@ "data": { "event": { "delta": { - "text": "type\": \"function\", \"name\":", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load data\n", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -19682,8 +19827,13 @@ "data": { "event": { "delta": { - "text": " \"knowledge_search\", \"parameters\": {\"query\": \"How", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "df = pd.read_csv('inflation.csv')\n\n", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -19702,8 +19852,13 @@ "data": { "event": { "delta": { - "text": " to use LoRA in Torchtune\"}}", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "# Convert 'date' column to datetime\ndf['date']", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -19725,15 +19880,9 @@ "parse_status": { "__enum__": "ToolCallParseStatus", "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "How to use LoRA in Torchtune" - }, - "call_id": "6070c836-0c9c-4f87-ba52-d9bf9ed44195", - "tool_name": "knowledge_search" + "value": "in_progress" }, + "tool_call": " = pd.to_datetime(df['date'])\n\n#", "type": "tool_call" }, "event_type": { @@ -19742,11 +19891,7 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } @@ -19757,42 +19902,43 @@ "data": { "event": { "delta": { - "text": "", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " Group by year and calculate average inflation\naverage_inflation = df", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:2a4c4\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:d4e29\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:d68cc\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".groupby(df['date'].dt.year)['inflation'].mean()\n\n#", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -19806,8 +19952,13 @@ "data": { "event": { "delta": { - "text": "I", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " Plot the time series\nplt.figure(figsize=(10,6))\nplt", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -19826,8 +19977,13 @@ "data": { "event": { "delta": { - "text": "'m ready to help you answer questions about Tor", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".plot(average_inflation.index, average_inflation.values, marker='o", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -19846,8 +20002,13 @@ "data": { "event": { "delta": { - "text": "chtune based on the documentation you provided", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "')\nplt.title('Average Yearly Inflation')\nplt.xlabel('Year')\n", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -19866,8 +20027,13 @@ "data": { "event": { "delta": { - "text": ". What's your first question?", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "plt.ylabel('Average Inflation')\nplt.grid(True)\nplt.show()", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -19886,13 +20052,28 @@ "data": { "event": { "delta": { - "text": "", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load data\ndf = pd.read_csv('inflation.csv')\n\n# Convert 'date' column to datetime\ndf['date'] = pd.to_datetime(df['date'])\n\n# Group by year and calculate average inflation\naverage_inflation = df.groupby(df['date'].dt.year)['inflation'].mean()\n\n# Plot the time series\nplt.figure(figsize=(10,6))\nplt.plot(average_inflation.index, average_inflation.values, marker='o')\nplt.title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel('Average Inflation')\nplt.grid(True)\nplt.show()" + }, + "call_id": "91ad7e4c-2e89-4cb5-9d0b-753ceafb7eab", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, "stop_reason": { @@ -19903,12 +20084,7 @@ }, "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:8c1f5\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:13786\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:f9c19\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"How to use LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:13786\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:13786\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:13786\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"It seems that the file \\\"\" does not exist. \\n\\nTo describe the csv file, you need to provide the actual file path or the file itself. If you are using a local file, you can use the `load_data` function from the `code_interpreter` library to load the file. \\n\\nHere is an example of how you can describe the csv file:\\n\\n```\\nimport pandas as pd\\nfrom code_interpreter import load_data\\n\\n# Load data\\ndf = load_data('inflation.csv')\\n\\n# Print summary of the data\\nprint(df.head()) # Print the first few rows of the data\\nprint(df.info()) # Print information about the data\\nprint(df.describe()) # Print summary statistics about the data\\n```\\n\\nPlease replace 'inflation.csv' with your actual csv file name. \\n\\nIf you are using a remote file, you need to provide the actual file path or the file itself. \\n\\nAlso, make sure that the file is in the correct format and that the pandas library can read it correctly.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load data\\ndf = pd.read_csv(\\\"inflation.csv\\\")\\n\\n# Convert date column to datetime\\ndf['date'] = pd.to_datetime(df['date'])\\n\\n# Group by year and calculate average inflation\\naverage_inflation = df.groupby(df['date'].dt.year)['inflation'].mean()\\n\\n# Plot average yearly inflation as a time series\\nplt.figure(figsize=(10,6))\\nplt.plot(average_inflation.index, average_inflation.values, marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Average Inflation')\\nplt.grid(True)\\nplt.show()\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "To", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -19955,7 +20140,7 @@ "data": { "event": { "delta": { - "text": " use LoRA in Torchtune, you can", + "text": "This", "type": "text" }, "event_type": { @@ -19975,7 +20160,7 @@ "data": { "event": { "delta": { - "text": " follow these steps:\n\n1. Install", + "text": " code will create a line plot of the", "type": "text" }, "event_type": { @@ -19995,7 +20180,7 @@ "data": { "event": { "delta": { - "text": " Torchtune and its dependencies.\n2", + "text": " average yearly inflation over time. The x", "type": "text" }, "event_type": { @@ -20015,7 +20200,7 @@ "data": { "event": { "delta": { - "text": ". Download the Llama2 weights and tokenizer.\n3.", + "text": "-axis represents the year and the y-axis represents the average inflation", "type": "text" }, "event_type": { @@ -20035,7 +20220,7 @@ "data": { "event": { "delta": { - "text": " Use the `lora_llama2_7b`", + "text": ". The plot will also include a title, labels for the", "type": "text" }, "event_type": { @@ -20055,7 +20240,7 @@ "data": { "event": { "delta": { - "text": " model in Torchtune, which applies", + "text": " x and y axes, and a grid to make it easier", "type": "text" }, "event_type": { @@ -20075,7 +20260,7 @@ "data": { "event": { "delta": { - "text": " LoRA to the Q and V projections by default.\n4.", + "text": " to read.\n\nPlease replace \"inflation.csv\" with your", "type": "text" }, "event_type": { @@ -20095,7 +20280,7 @@ "data": { "event": { "delta": { - "text": " Load the base model weights into the LoRA model without any", + "text": " actual csv file name. \n\nAlso, make sure that the file", "type": "text" }, "event_type": { @@ -20115,7 +20300,7 @@ "data": { "event": { "delta": { - "text": " conversion necessary.\n5. Set only LoRA parameters to", + "text": " is in the correct format and that the pandas library can read it", "type": "text" }, "event_type": { @@ -20135,7 +20320,7 @@ "data": { "event": { "delta": { - "text": " trainable.\n6. Run the LoRA finetuning recipe", + "text": " correctly. \n\nIf your csv file has a different column name for", "type": "text" }, "event_type": { @@ -20155,7 +20340,7 @@ "data": { "event": { "delta": { - "text": " in Torchtune with the desired configuration.\n\nYou can also experiment", + "text": " the date, you will need to replace", "type": "text" }, "event_type": { @@ -20175,7 +20360,7 @@ "data": { "event": { "delta": { - "text": " with different LoRA configurations, such as applying LoRA to all", + "text": " 'date' with the actual column name. \n\nIf your csv", "type": "text" }, "event_type": { @@ -20195,7 +20380,7 @@ "data": { "event": { "delta": { - "text": " linear layers in the self-attention, increasing the rank, or", + "text": " file has a different column name for the inflation, you will need", "type": "text" }, "event_type": { @@ -20215,7 +20400,7 @@ "data": { "event": { "delta": { - "text": " scaling alpha and rank together.\n\nBy following these steps,", + "text": " to replace 'inflation' with the actual column name. \n\n", "type": "text" }, "event_type": { @@ -20235,7 +20420,7 @@ "data": { "event": { "delta": { - "text": " you can use LoRA in Torchtune to", + "text": "If you want to save the plot to a file instead of displaying", "type": "text" }, "event_type": { @@ -20255,7 +20440,7 @@ "data": { "event": { "delta": { - "text": " fine-tune a Llama2 model with a", + "text": " it, you can use the `savefig` method. For", "type": "text" }, "event_type": { @@ -20275,7 +20460,7 @@ "data": { "event": { "delta": { - "text": " low memory footprint and achieve good performance.", + "text": " example:\n\n```\nplt.savefig('average_inflation.png')\n```", "type": "text" }, "event_type": { @@ -20317,16 +20502,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "fHo5RmyV", + "span_id": "2Yx8i0id", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-06T04:49:04.498360+00:00", + "__datetime__": "2025-03-06T04:47:51.132007+00:00", "__module__": "datetime" }, - "trace_id": "NIVx0ka-TmKDiZaU", + "trace_id": "N2BeNv66RcO7NRuE", "type": "metric", "unit": "tokens", - "value": 158 + "value": 666 }, { "attributes": { @@ -20334,13 +20519,13 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "fHo5RmyV", + "span_id": "2Yx8i0id", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-06T04:49:04.498396+00:00", + "__datetime__": "2025-03-06T04:47:51.132048+00:00", "__module__": "datetime" }, - "trace_id": "NIVx0ka-TmKDiZaU", + "trace_id": "N2BeNv66RcO7NRuE", "type": "metric", "unit": "tokens", "value": 200 @@ -20351,16 +20536,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "fHo5RmyV", + "span_id": "2Yx8i0id", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-06T04:49:04.498403+00:00", + "__datetime__": "2025-03-06T04:47:51.132054+00:00", "__module__": "datetime" }, - "trace_id": "NIVx0ka-TmKDiZaU", + "trace_id": "N2BeNv66RcO7NRuE", "type": "metric", "unit": "tokens", - "value": 358 + "value": 866 } ] } @@ -20368,7 +20553,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:8c1f5\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:13786\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:f9c19\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"It seems that the file \\\"\" does not exist. \\n\\nTo describe the csv file, you need to provide the actual file path or the file itself. If you are using a local file, you can use the `load_data` function from the `code_interpreter` library to load the file. \\n\\nHere is an example of how you can describe the csv file:\\n\\n```\\nimport pandas as pd\\nfrom code_interpreter import load_data\\n\\n# Load data\\ndf = load_data('inflation.csv')\\n\\n# Print summary of the data\\nprint(df.head()) # Print the first few rows of the data\\nprint(df.info()) # Print information about the data\\nprint(df.describe()) # Print summary statistics about the data\\n```\\n\\nPlease replace 'inflation.csv' with your actual csv file name. \\n\\nIf you are using a remote file, you need to provide the actual file path or the file itself. \\n\\nAlso, make sure that the file is in the correct format and that the pandas library can read it correctly.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -20396,8 +20581,13 @@ "data": { "event": { "delta": { - "text": "{\"", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -20416,8 +20606,13 @@ "data": { "event": { "delta": { - "text": "type\": \"function\", \"", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -20436,8 +20631,13 @@ "data": { "event": { "delta": { - "text": "name\": \"knowledge_search\", \"parameters", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " data\ndf = pd.read_csv(\"", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -20456,8 +20656,13 @@ "data": { "event": { "delta": { - "text": "\": {\"query\": \"How to use LoRA in Torcht", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "inflation.csv\")\n\n# Convert date column to datetime\ndf", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -20476,8 +20681,13 @@ "data": { "event": { "delta": { - "text": "une\"}}", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "['date'] = pd.to_datetime(df['date'])\n\n# Group", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -20499,15 +20709,9 @@ "parse_status": { "__enum__": "ToolCallParseStatus", "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "How to use LoRA in Torchtune" - }, - "call_id": "7815c1ab-fbdf-42e8-84a7-b1f74f67d863", - "tool_name": "knowledge_search" + "value": "in_progress" }, + "tool_call": " by year and calculate average inflation\naverage_inflation = df.groupby", "type": "tool_call" }, "event_type": { @@ -20516,11 +20720,7 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } @@ -20531,94 +20731,68 @@ "data": { "event": { "delta": { - "text": "", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "(df['date'].dt.year)['inflation'].mean()\n\n#", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "KM-vILDG", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:49:01.270069+00:00", - "__module__": "datetime" + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" }, - "trace_id": "NIVx0ka-TmKDiZaU", - "type": "metric", - "unit": "tokens", - "value": 117 + "tool_call": " Plot average yearly inflation as a time series\nplt.figure(figsize=(", + "type": "tool_call" }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "KM-vILDG", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:49:01.270143+00:00", - "__module__": "datetime" - }, - "trace_id": "NIVx0ka-TmKDiZaU", - "type": "metric", - "unit": "tokens", - "value": 40 + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "KM-vILDG", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:49:01.270151+00:00", - "__module__": "datetime" - }, - "trace_id": "NIVx0ka-TmKDiZaU", - "type": "metric", - "unit": "tokens", - "value": 157 - } - ] + "logprobs": null, + "stop_reason": null + }, + "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:8c1f5\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:13786\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:f9c19\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "10,6))\nplt.plot(average_in", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -20632,8 +20806,13 @@ "data": { "event": { "delta": { - "text": "I", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "flation.index, average_inflation.values, marker='o')\nplt", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -20652,8 +20831,13 @@ "data": { "event": { "delta": { - "text": "'m ready to help you answer questions about Torcht", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".title('Average Yearly Inflation')\nplt.xlabel('Year')\n", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -20672,8 +20856,13 @@ "data": { "event": { "delta": { - "text": "une based on the documentation you provided. What's your first", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "plt.ylabel('Average Inflation')\nplt.grid(True)\nplt.show", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -20692,8 +20881,13 @@ "data": { "event": { "delta": { - "text": " question?", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "()", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -20706,6 +20900,45 @@ "metrics": null } }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load data\ndf = pd.read_csv(\"inflation.csv\")\n\n# Convert date column to datetime\ndf['date'] = pd.to_datetime(df['date'])\n\n# Group by year and calculate average inflation\naverage_inflation = df.groupby(df['date'].dt.year)['inflation'].mean()\n\n# Plot average yearly inflation as a time series\nplt.figure(figsize=(10,6))\nplt.plot(average_inflation.index, average_inflation.values, marker='o')\nplt.title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel('Average Inflation')\nplt.grid(True)\nplt.show()" + }, + "call_id": "cfae3ff5-49f8-439d-b740-603bc93fb5a3", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", @@ -20734,16 +20967,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "5yc3Hts6", + "span_id": "JNrmlTTc", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-06T04:48:59.857021+00:00", + "__datetime__": "2025-03-06T04:47:39.920493+00:00", "__module__": "datetime" }, - "trace_id": "6KRztpbwTwquLEUn", + "trace_id": "N2BeNv66RcO7NRuE", "type": "metric", "unit": "tokens", - "value": 75 + "value": 476 }, { "attributes": { @@ -20751,16 +20984,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "5yc3Hts6", + "span_id": "JNrmlTTc", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-06T04:48:59.857048+00:00", + "__datetime__": "2025-03-06T04:47:39.920519+00:00", "__module__": "datetime" }, - "trace_id": "6KRztpbwTwquLEUn", + "trace_id": "N2BeNv66RcO7NRuE", "type": "metric", "unit": "tokens", - "value": 35 + "value": 10 }, { "attributes": { @@ -20768,16 +21001,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "5yc3Hts6", + "span_id": "JNrmlTTc", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-06T04:48:59.857055+00:00", + "__datetime__": "2025-03-06T04:47:39.920522+00:00", "__module__": "datetime" }, - "trace_id": "6KRztpbwTwquLEUn", + "trace_id": "N2BeNv66RcO7NRuE", "type": "metric", "unit": "tokens", - "value": 110 + "value": 486 } ] } @@ -20785,7 +21018,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:a4c57\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:46132\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:392a8\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"How to use LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:46132\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:46132\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:46132\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stdout]\\nNumber of rows and columns in the data: (10, 13)\\nColumns of the data are: 13\\nColumns of the data are: Index(['Year', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep',\\n 'Oct', 'Nov', 'Dec'],\\n dtype='object')\\nDatatype of the columns are: Year int64\\nJan float64\\nFeb float64\\nMar float64\\nApr float64\\nMay float64\\nJun float64\\nJul float64\\nAug float64\\nSep float64\\nOct float64\\nNov float64\\nDec float64\\ndtype: object\\n[/stdout]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"This CSV file contains 10 rows and 13 columns. The columns are named 'Year', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'. The data types of these columns are int64 for 'Year' and float64 for the rest.\\n\\nIt appears that this CSV file contains monthly inflation rates for different years. The 'Year' column represents the year, and the rest of the columns represent the inflation rates for each month of the year.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n\\n# Calculate average yearly inflation\\ndf['Average'] = df[['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']].mean(axis=1)\\n\\n# Plot time series\\nplt.figure(figsize=(10,6))\\nplt.plot(df['Year'], df['Average'])\\nplt.xlabel('Year')\\nplt.ylabel('Average Yearly Inflation')\\nplt.title('Average Yearly Inflation Over Time')\\nplt.grid(True)\\nplt.show()\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -20813,7 +21046,7 @@ "data": { "event": { "delta": { - "text": "To", + "text": "This", "type": "text" }, "event_type": { @@ -20833,7 +21066,7 @@ "data": { "event": { "delta": { - "text": " use LoRA in Torchtune, you can follow these steps", + "text": " code will create a line plot of the average", "type": "text" }, "event_type": { @@ -20853,7 +21086,7 @@ "data": { "event": { "delta": { - "text": ":\n\n1. Install Torchtune and its dependencies.\n2", + "text": " yearly inflation over time. The x-axis represents", "type": "text" }, "event_type": { @@ -20873,7 +21106,7 @@ "data": { "event": { "delta": { - "text": ". Download the Llama2 weights and tokenizer.\n3. ", + "text": " the year, and the y-axis represents the average", "type": "text" }, "event_type": { @@ -20893,7 +21126,7 @@ "data": { "event": { "delta": { - "text": " Use the `lora_llama2_7b` model in", + "text": " yearly inflation. The plot will show the trend of average yearly inflation", "type": "text" }, "event_type": { @@ -20913,7 +21146,7 @@ "data": { "event": { "delta": { - "text": " Torchtune, which applies LoRA to the Q and V projections", + "text": " over the years.", "type": "text" }, "event_type": { @@ -20933,13 +21166,58 @@ "data": { "event": { "delta": { - "text": " by default.\n4. Load the base model weights into the Lo", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 633 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 56 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 689 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stdout]\\nNumber of rows and columns in the data: (10, 13)\\nColumns of the data are: 13\\nColumns of the data are: Index(['Year', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep',\\n 'Oct', 'Nov', 'Dec'],\\n dtype='object')\\nDatatype of the columns are: Year int64\\nJan float64\\nFeb float64\\nMar float64\\nApr float64\\nMay float64\\nJun float64\\nJul float64\\nAug float64\\nSep float64\\nOct float64\\nNov float64\\nDec float64\\ndtype: object\\n[/stdout]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"This CSV file contains 10 rows and 13 columns. The columns are named 'Year', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'. The data types of these columns are int64 for 'Year' and float64 for the rest.\\n\\nIt appears that this CSV file contains monthly inflation rates for different years. The 'Year' column represents the year, and the rest of the columns represent the inflation rates for each month of the year.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" }, "logprobs": null, "stop_reason": null @@ -20953,8 +21231,13 @@ "data": { "event": { "delta": { - "text": "RA model without any conversion necessary.\n5.", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -20973,8 +21256,13 @@ "data": { "event": { "delta": { - "text": " Set only LoRA parameters to trainable.\n6. Run", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\nimport matplotlib.pyplot", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -20993,8 +21281,13 @@ "data": { "event": { "delta": { - "text": " the LoRA finetuning recipe in Torchtune with the desired", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " as plt\n\n# Load data\ndf = pd.read_csv(\"/", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -21013,8 +21306,13 @@ "data": { "event": { "delta": { - "text": " configuration.\n\nYou can also experiment with different LoRA configurations, such as", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "var/folders/cz/vyh7y1d11", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -21033,8 +21331,13 @@ "data": { "event": { "delta": { - "text": " applying LoRA to all linear layers in the self-attention, increasing", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "xg881lsxsshnc5c0000gn/T", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -21053,8 +21356,13 @@ "data": { "event": { "delta": { - "text": " the rank, or scaling alpha and rank together.\n\nBy following these steps", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "/tmp_d_cdeif/UuctHlJzinflation.csv\")\n\n", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -21073,8 +21381,13 @@ "data": { "event": { "delta": { - "text": ", you can use LoRA in Torcht", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "# Calculate average yearly inflation\ndf['Average", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -21093,8 +21406,13 @@ "data": { "event": { "delta": { - "text": "une to fine-tune a Llama2 model with a low", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "'] = df[['Jan', 'Feb', 'Mar',", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -21113,8 +21431,13 @@ "data": { "event": { "delta": { - "text": " memory footprint and achieve good performance.", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " 'Apr', 'May', 'Jun',", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -21133,42 +21456,43 @@ "data": { "event": { "delta": { - "text": "", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " 'Jul', 'Aug', 'Sep', 'Oct', '", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:a4c57\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:46132\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:392a8\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "Nov', 'Dec']].mean(axis=1)\n\n# Plot time series", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -21182,8 +21506,13 @@ "data": { "event": { "delta": { - "text": "{\"", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\nplt.figure(figsize=(10,6))\nplt.plot(df['Year", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -21202,8 +21531,13 @@ "data": { "event": { "delta": { - "text": "type\": \"function\", \"name\":", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "'], df['Average'])\nplt.xlabel('Year')\nplt.ylabel('Average", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -21222,8 +21556,13 @@ "data": { "event": { "delta": { - "text": " \"knowledge_search\", \"parameters\": {\"query\": \"How to", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " Yearly Inflation')\nplt.title('Average Yearly Inflation Over", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -21242,8 +21581,13 @@ "data": { "event": { "delta": { - "text": " use LoRA in Torchtune\"}}", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " Time')\nplt.grid(True)\nplt.show()", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -21269,10 +21613,14 @@ }, "tool_call": { "arguments": { - "query": "How to use LoRA in Torchtune" + "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load data\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmp_d_cdeif/UuctHlJzinflation.csv\")\n\n# Calculate average yearly inflation\ndf['Average'] = df[['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']].mean(axis=1)\n\n# Plot time series\nplt.figure(figsize=(10,6))\nplt.plot(df['Year'], df['Average'])\nplt.xlabel('Year')\nplt.ylabel('Average Yearly Inflation')\nplt.title('Average Yearly Inflation Over Time')\nplt.grid(True)\nplt.show()" }, - "call_id": "45ec3014-ff3f-4d0b-9649-30a299f7b9d4", - "tool_name": "knowledge_search" + "call_id": "f953fd92-9413-4968-9ffa-f85ddea173dc", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } }, "type": "tool_call" }, @@ -21312,13 +21660,29 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 453 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 463 + } + ] } } ], "type": "generator" }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:a4c57\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:46132\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:392a8\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stdout]\\nNumber of rows and columns in the data: (10, 13)\\nColumns of the data are: 13\\nColumns of the data are: Index(['Year', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep',\\n 'Oct', 'Nov', 'Dec'],\\n dtype='object')\\nDatatype of the columns are: Year int64\\nJan float64\\nFeb float64\\nMar float64\\nApr float64\\nMay float64\\nJun float64\\nJul float64\\nAug float64\\nSep float64\\nOct float64\\nNov float64\\nDec float64\\ndtype: object\\n[/stdout]\", \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"This CSV file contains 10 rows and 13 columns. The columns are named 'Year', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'. The data types of these columns are int64 for 'Year' and float64 for the rest.\\n\\nIt appears that this CSV file contains monthly inflation rates for different years. The 'Year' column represents the year, and the rest of the columns represent the inflation rates for each month of the year.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n\\n# Calculate average yearly inflation\\ndf['Average'] = df[['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']].mean(axis=1)\\n\\n# Plot time series\\nplt.figure(figsize=(10,6))\\nplt.plot(df['Year'], df['Average'])\\nplt.xlabel('Year')\\nplt.ylabel('Average Yearly Inflation')\\nplt.title('Average Yearly Inflation Over Time')\\nplt.grid(True)\\nplt.show()\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -21346,7 +21710,7 @@ "data": { "event": { "delta": { - "text": "I", + "text": "This", "type": "text" }, "event_type": { @@ -21366,7 +21730,7 @@ "data": { "event": { "delta": { - "text": "'m ready to help you answer", + "text": " code will create a line plot of the average yearly inflation over time. The", "type": "text" }, "event_type": { @@ -21386,7 +21750,7 @@ "data": { "event": { "delta": { - "text": " questions about Torchtune based on the documentation you provided.", + "text": " x-axis represents the year, and the y-axis represents the average yearly inflation", "type": "text" }, "event_type": { @@ -21406,7 +21770,27 @@ "data": { "event": { "delta": { - "text": " What's your first question?", + "text": ". The plot will show the trend of average yearly inflation over the years", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ".", "type": "text" }, "event_type": { @@ -21441,13 +21825,29 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 635 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 56 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 691 + } + ] } } ], "type": "generator" }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:b222e\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:1b69d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:deca9\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"How to use LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:1b69d\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:1b69d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:1b69d\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stdout]\\nNumber of rows and columns in the data: (10, 13)\\nColumns of the data are: 13\\nColumns of the data are: Index(['Year', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep',\\n 'Oct', 'Nov', 'Dec'],\\n dtype='object')\\nDatatype of the columns are: Year int64\\nJan float64\\nFeb float64\\nMar float64\\nApr float64\\nMay float64\\nJun float64\\nJul float64\\nAug float64\\nSep float64\\nOct float64\\nNov float64\\nDec float64\\ndtype: object\\n[/stdout]\", \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"This CSV file contains 10 rows and 13 columns. The columns are named 'Year', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'. The data types of these columns are int64 for 'Year' and float64 for the rest.\\n\\nIt appears that this CSV file contains monthly inflation rates for different years. The 'Year' column represents the year, and the rest of the columns represent the inflation rates for each month of the year.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -21475,88 +21875,13 @@ "data": { "event": { "delta": { - "text": "To", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " use LoRA in Torchtune, you can follow these steps:\n\n", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "1. Install Torchtune and its dependencies", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": ".\n2. Download the Llama2 weights and", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " tokenizer.\n3. Construct a", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -21575,8 +21900,13 @@ "data": { "event": { "delta": { - "text": " Llama2 model with LoRA layers using `lora_ll", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\nimport matplotlib.pyplot as", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -21595,8 +21925,13 @@ "data": { "event": { "delta": { - "text": "ama2_7b`.\n4.", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " plt\n\n# Load data\ndf = pd.read_csv(\"/var/f", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -21615,8 +21950,13 @@ "data": { "event": { "delta": { - "text": " Load the base model weights into the LoRA model without", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "olders/cz/vyh7y1d11xg881lsx", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -21635,8 +21975,13 @@ "data": { "event": { "delta": { - "text": " any conversion necessary.\n5. Set only LoRA parameters to", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "sshnc5c0000gn/T/tmpflpgiagc/", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -21655,8 +22000,13 @@ "data": { "event": { "delta": { - "text": " trainable.\n6. Run a LoRA finetune using", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "2VkeqrPlinflation.csv\")\n\n# Calculate average yearly inflation", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -21675,8 +22025,13 @@ "data": { "event": { "delta": { - "text": " Torchtune's `LoRA recipe`.\n\nYou can also experiment", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\ndf['Average'] = df[['Jan', 'Feb', '", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -21695,8 +22050,13 @@ "data": { "event": { "delta": { - "text": " with different LoRA configurations, such as applying LoRA to all", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "Mar', 'Apr', 'May', 'Jun', 'Jul',", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -21715,8 +22075,13 @@ "data": { "event": { "delta": { - "text": " linear layers in the self-attention,", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " 'Aug', 'Sep', 'Oct', 'Nov', 'Dec", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -21735,8 +22100,13 @@ "data": { "event": { "delta": { - "text": " increasing the rank to 16 or ", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "']].mean(axis=1)\n\n# Plot time series\nplt.figure(figsize", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -21755,8 +22125,13 @@ "data": { "event": { "delta": { - "text": "32, and scaling alpha and rank together.\n\nNote that LoRA", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "=(10,6))\nplt.plot(df['Year'], df['Average", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -21775,8 +22150,13 @@ "data": { "event": { "delta": { - "text": " can be beneficial for reducing memory usage during fine-tuning", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "'])\nplt.xlabel('Year')\nplt.ylabel('Average Yearly", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -21795,8 +22175,13 @@ "data": { "event": { "delta": { - "text": ", but it may also impact model performance", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " Inflation')\nplt.title('Average Yearly Inflation Over", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -21815,8 +22200,13 @@ "data": { "event": { "delta": { - "text": ". You can trade off memory and model", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " Time')\nplt.grid(True)\nplt.show()", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -21835,8 +22225,23 @@ "data": { "event": { "delta": { - "text": " performance by adjusting the LoRA configuration.", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load data\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpflpgiagc/2VkeqrPlinflation.csv\")\n\n# Calculate average yearly inflation\ndf['Average'] = df[['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']].mean(axis=1)\n\n# Plot time series\nplt.figure(figsize=(10,6))\nplt.plot(df['Year'], df['Average'])\nplt.xlabel('Year')\nplt.ylabel('Average Yearly Inflation')\nplt.title('Average Yearly Inflation Over Time')\nplt.grid(True)\nplt.show()" + }, + "call_id": "f82fa3fd-e3be-4cb7-9298-8b4625cf709e", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -21844,7 +22249,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -21872,55 +22281,19 @@ }, "metrics": [ { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, "metric": "prompt_tokens", - "span_id": "BHazvRV1", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:49:10.165627+00:00", - "__module__": "datetime" - }, - "trace_id": "1NwedpozRqOVQXRs", - "type": "metric", - "unit": "tokens", - "value": 158 + "unit": null, + "value": 454 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, "metric": "completion_tokens", - "span_id": "BHazvRV1", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:49:10.165662+00:00", - "__module__": "datetime" - }, - "trace_id": "1NwedpozRqOVQXRs", - "type": "metric", - "unit": "tokens", - "value": 202 + "unit": null, + "value": 10 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, "metric": "total_tokens", - "span_id": "BHazvRV1", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:49:10.165670+00:00", - "__module__": "datetime" - }, - "trace_id": "1NwedpozRqOVQXRs", - "type": "metric", - "unit": "tokens", - "value": 360 + "unit": null, + "value": 464 } ] } @@ -21928,7 +22301,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:b222e\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:1b69d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:deca9\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stdout]\\nNumber of rows and columns in the data: (10, 13)\\nColumns of the data are: 13\\nColumns of the data are: Index(['Year', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep',\\n 'Oct', 'Nov', 'Dec'],\\n dtype='object')\\nDatatype of the columns are: Year int64\\nJan float64\\nFeb float64\\nMar float64\\nApr float64\\nMay float64\\nJun float64\\nJul float64\\nAug float64\\nSep float64\\nOct float64\\nNov float64\\nDec float64\\ndtype: object\\n[/stdout]\", \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"This CSV file contains 10 rows and 13 columns. The columns are named 'Year', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'. The data types of these columns are int64 for 'Year' and float64 for the rest.\\n\\nThe 'Year' column likely contains the year for which the inflation rates are given. The other columns ('Jan' to 'Dec') likely contain the inflation rates for each month of the year.\\n\\nPlease note that the actual data in the CSV file is not provided, so the above description is based on the structure of the file.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n\\n# Calculate average yearly inflation\\ndf['Average'] = df[['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']].mean(axis=1)\\n\\n# Plot time series\\nplt.figure(figsize=(10,6))\\nplt.plot(df['Year'], df['Average'])\\nplt.xlabel('Year')\\nplt.ylabel('Average Yearly Inflation')\\nplt.title('Average Yearly Inflation Over Time')\\nplt.grid(True)\\nplt.show()\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -21956,7 +22329,7 @@ "data": { "event": { "delta": { - "text": "{\"", + "text": "This", "type": "text" }, "event_type": { @@ -21976,7 +22349,7 @@ "data": { "event": { "delta": { - "text": "type\": \"function\", \"name\": \"knowledge_search\", \"", + "text": " code will create a line plot of the average yearly inflation over", "type": "text" }, "event_type": { @@ -21996,7 +22369,7 @@ "data": { "event": { "delta": { - "text": "parameters\": {\"query\": \"How to use LoRA in Tor", + "text": " time. The x-axis represents the year and the y-axis", "type": "text" }, "event_type": { @@ -22016,7 +22389,7 @@ "data": { "event": { "delta": { - "text": "chtune\"}}", + "text": " represents the average yearly inflation. The plot will show the trend", "type": "text" }, "event_type": { @@ -22036,19 +22409,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "How to use LoRA in Torchtune" - }, - "call_id": "c92271a7-37e2-4396-aa7f-5805b9273a71", - "tool_name": "knowledge_search" - }, - "type": "tool_call" + "text": " of average yearly inflation over the years.", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -22056,11 +22418,7 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } @@ -22088,55 +22446,19 @@ }, "metrics": [ { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, "metric": "prompt_tokens", - "span_id": "Z6HS-lIg", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:49:08.648346+00:00", - "__module__": "datetime" - }, - "trace_id": "1NwedpozRqOVQXRs", - "type": "metric", - "unit": "tokens", - "value": 117 + "unit": null, + "value": 661 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, "metric": "completion_tokens", - "span_id": "Z6HS-lIg", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:49:08.648375+00:00", - "__module__": "datetime" - }, - "trace_id": "1NwedpozRqOVQXRs", - "type": "metric", - "unit": "tokens", - "value": 40 + "unit": null, + "value": 55 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, "metric": "total_tokens", - "span_id": "Z6HS-lIg", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:49:08.648382+00:00", - "__module__": "datetime" - }, - "trace_id": "1NwedpozRqOVQXRs", - "type": "metric", - "unit": "tokens", - "value": 157 + "unit": null, + "value": 716 } ] } @@ -22144,7 +22466,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:b222e\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:1b69d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:deca9\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stdout]\\nNumber of rows and columns in the data: (10, 13)\\nColumns of the data are: 13\\nColumns of the data are: Index(['Year', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep',\\n 'Oct', 'Nov', 'Dec'],\\n dtype='object')\\nDatatype of the columns are: Year int64\\nJan float64\\nFeb float64\\nMar float64\\nApr float64\\nMay float64\\nJun float64\\nJul float64\\nAug float64\\nSep float64\\nOct float64\\nNov float64\\nDec float64\\ndtype: object\\n[/stdout]\", \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"This CSV file contains 10 rows and 13 columns. The columns are named 'Year', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'. The data types of these columns are int64 for 'Year' and float64 for the rest.\\n\\nThe 'Year' column likely contains the year for which the inflation rates are given. The other columns ('Jan' to 'Dec') likely contain the inflation rates for each month of the year.\\n\\nPlease note that the actual data in the CSV file is not provided, so the above description is based on the structure of the file.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -22172,8 +22494,13 @@ "data": { "event": { "delta": { - "text": "I", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -22192,8 +22519,13 @@ "data": { "event": { "delta": { - "text": "'m ready to help you answer questions about", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load data\n", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -22212,8 +22544,13 @@ "data": { "event": { "delta": { - "text": " Torchtune based on the documentation you provided. What's your", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "df = pd.read_csv(\"/var/folders/cz/vyh7", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -22232,8 +22569,13 @@ "data": { "event": { "delta": { - "text": " first question?", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "y1d11xg881lsxsshnc5c0000", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -22252,94 +22594,43 @@ "data": { "event": { "delta": { - "text": "", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "gn/T/tmpfsp7c9_g/Aih5TPOuin", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "o33PSCts", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:49:07.268876+00:00", - "__module__": "datetime" - }, - "trace_id": "edTwKHK5Q4K8yCqt", - "type": "metric", - "unit": "tokens", - "value": 75 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "o33PSCts", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:49:07.268906+00:00", - "__module__": "datetime" - }, - "trace_id": "edTwKHK5Q4K8yCqt", - "type": "metric", - "unit": "tokens", - "value": 35 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "o33PSCts", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:49:07.268914+00:00", - "__module__": "datetime" - }, - "trace_id": "edTwKHK5Q4K8yCqt", - "type": "metric", - "unit": "tokens", - "value": 110 - } - ] + "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:ea3f6\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:5c435\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:91d52\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"How to use LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:5c435\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:5c435\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:5c435\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " the self-attention.\n5. Increase the `l", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -22593,7 +22973,7 @@ "data": { "event": { "delta": { - "text": "ora_rank` and `lora_alpha` arguments to improve model", + "text": "It", "type": "text" }, "event_type": { @@ -22613,7 +22993,7 @@ "data": { "event": { "delta": { - "text": " performance.\n6. Run the LoRA finetuning recipe", + "text": " seems that the file \"/var/folders", "type": "text" }, "event_type": { @@ -22633,7 +23013,7 @@ "data": { "event": { "delta": { - "text": " in Torchtune using the `lora_finetune", + "text": "/cz/vyh7y1d11xg881", "type": "text" }, "event_type": { @@ -22653,7 +23033,7 @@ "data": { "event": { "delta": { - "text": "_distributed` command.\n\nBy following", + "text": "lsxsshnc5c0000gn/T/tmp4ed7", "type": "text" }, "event_type": { @@ -22673,7 +23053,7 @@ "data": { "event": { "delta": { - "text": " these steps, you can apply Lo", + "text": "p2bg/UZ0Z335vinflation.csv\" does", "type": "text" }, "event_type": { @@ -22693,7 +23073,7 @@ "data": { "event": { "delta": { - "text": "RA to your Llama2 model and fine-tune it", + "text": " not exist. \n\nTo describe the csv file, you need to", "type": "text" }, "event_type": { @@ -22713,7 +23093,7 @@ "data": { "event": { "delta": { - "text": " using Torchtune.", + "text": " provide the actual file path or the file itself. If the file", "type": "text" }, "event_type": { @@ -22733,42 +23113,33 @@ "data": { "event": { "delta": { - "text": "", + "text": " is too large to be uploaded, you can provide a sample", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:ea3f6\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:5c435\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:91d52\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", + "text": " of the csv file and I can help you describe it. \n\nHere is", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -22782,7 +23153,7 @@ "data": { "event": { "delta": { - "text": "{\"", + "text": " an example of how you can describe a", "type": "text" }, "event_type": { @@ -22802,7 +23173,7 @@ "data": { "event": { "delta": { - "text": "type\": \"function\", \"name\": \"", + "text": " csv file using pandas:\n\n```\nimport pandas as pd\n#", "type": "text" }, "event_type": { @@ -22822,7 +23193,7 @@ "data": { "event": { "delta": { - "text": "knowledge_search\", \"parameters\": {\"query", + "text": " Load data\ndf = pd.read_csv('", "type": "text" }, "event_type": { @@ -22842,7 +23213,7 @@ "data": { "event": { "delta": { - "text": "\": \"How to use LoRA in Torchtune\"}}", + "text": "inflation.csv')\n# Print the first 5 rows of the", "type": "text" }, "event_type": { @@ -22862,19 +23233,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "How to use LoRA in Torchtune" - }, - "call_id": "3f9aaa8a-ca61-4a51-830a-e9920d3d8ec5", - "tool_name": "knowledge_search" - }, - "type": "tool_call" + "text": " data\nprint(df.head())\n# Print the last 5 rows of the", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -22882,11 +23242,7 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } @@ -22897,42 +23253,33 @@ "data": { "event": { "delta": { - "text": "", + "text": " data\nprint(df.tail())\n# Print the summary statistics of the data\n", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:ea3f6\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:5c435\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:91d52\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", + "text": "print(df.describe())\n# Print the data types of each column\nprint(df", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -22946,7 +23293,7 @@ "data": { "event": { "delta": { - "text": "I", + "text": ".dtypes)\n# Print the number of missing values in each column\nprint", "type": "text" }, "event_type": { @@ -22966,7 +23313,7 @@ "data": { "event": { "delta": { - "text": "'m ready to help you answer questions about Torchtune based on the", + "text": "(df.isnull().sum())\n```\n\nThis will give you an idea of", "type": "text" }, "event_type": { @@ -22986,7 +23333,7 @@ "data": { "event": { "delta": { - "text": " documentation you provided. What's your first question?", + "text": " what the csv file contains.", "type": "text" }, "event_type": { @@ -23027,7 +23374,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stdout]\\nNumber of rows and columns in the data: (10, 13)\\nColumns of the data are: 13\\nColumns of the data are: Index(['Year', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep',\\n 'Oct', 'Nov', 'Dec'],\\n dtype='object')\\nDatatype of the columns are: Year int64\\nJan float64\\nFeb float64\\nMar float64\\nApr float64\\nMay float64\\nJun float64\\nJul float64\\nAug float64\\nSep float64\\nOct float64\\nNov float64\\nDec float64\\ndtype: object\\n[/stdout]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -23055,13 +23402,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" + "text": "This", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -23080,13 +23422,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "{\"type\": \"function\", \"name\": \"knowledge_search\", \"parameters", - "type": "tool_call" + "text": " CSV file contains 10 rows and 13 columns. The columns", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -23105,13 +23442,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "\": {\"query\": \"Torchtune documentation\"}}", - "type": "tool_call" + "text": " are named 'Year', 'Jan', 'Feb', 'Mar',", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -23130,19 +23462,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "Torchtune documentation" - }, - "call_id": "5cfa4683-2147-41ab-9a44-a8b7f23e9f75", - "tool_name": "knowledge_search" - }, - "type": "tool_call" + "text": " 'Apr', 'May', 'Jun', 'Jul', 'Aug", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -23150,11 +23471,7 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } @@ -23165,42 +23482,13 @@ "data": { "event": { "delta": { - "text": "", + "text": "', 'Sep', 'Oct', 'Nov', '", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": null - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Llama3-8B attention type\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:num-1\\nContent: 3 `_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\\nof models across a `range of different benchmarks `_.\\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. In this tutorial we will focus on the 8B size model.\\nThere are a few main changes between Llama2-7B and Llama3-8B models:\\n\\n- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B\\n- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:num-1\\nContent: instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_\\n\\n|\\n\\nGetting access to Llama3-8B-Instruct\\n------------------------------------\\n\\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. First, let's download the model from Hugging Face. You will need to follow the instructions\\non the `official Meta page `_ to gain access to the model.\\nNext, make sure you grab your Hugging Face token from `here `_.\\n\\n\\n.. code-block:: bash\\n\\n tune download meta-llama/Meta-Llama-3\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:num-0\\nContent: :`download Llama3 Instruct weights `\\n\\n\\nTemplate changes from Llama2 to Llama3\\n--------------------------------------\\n\\nThe Llama2 chat model requires a specific template when prompting the pre-trained\\nmodel. Since the chat model was pretrained with this prompt template, if you want to run\\ninference on the model, you'll need to use the same template for optimal performance\\non chat data. Otherwise, the model will just perform standard text completion, which\\nmay or may not align with your intended use case.\\n\\nFrom the `official Llama2 prompt\\ntemplate guide `_\\nfor the Llama2 chat model, we can see that special tags are added:\\n\\n.. code-block:: text\\n\\n [INST] <>\\n You are a helpful, respectful, and honest assistant.\\n <>\\n\\n Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant \\n\\nLlama3 Instruct `overhauled `\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:num-0\\nContent: 'm Meta AI, your friendly AI assistant<|eot_id|>\\n\\nThe tags are entirely different, and they are actually encoded differently than in\\nLlama2. Let's walk through tokenizing an example with the Llama2 template and the\\nLlama3 template to understand how.\\n\\n.. note::\\n The Llama3 Base model uses a `different prompt template\\n `_ than Llama3 Instruct\\n because it has not yet been instruct tuned and the extra special tokens are untrained. If you\\n are running inference on the Llama3 Base model without fine-tuning we recommend the base\\n template for optimal performance. Generally, for instruct and chat data, we recommend using\\n Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\\n Llama3 Instruct.\\n\\n.. _prompt_template_vs_special_tokens:\\n\\nTokenizing prompt templates & special tokens\\n--------------------------------------------\\n\\nLet's say I have a sample of a single user-assistant turn accompanied with a system\\nprompt:\\n\\n.. code-block:: python\\n\\n sample = [\\n {\\n \\\"role\\\": \\\"system\\\",\\n \\\"\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:num-3\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\", and \\\"output_proj\\\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\\\"q_proj\\\", \\\"v_proj\\\"])\\n\\n.. note::\\n\\n Calling :func:`lora_llama_2\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Insert documents into memory\", \"parameters\": {}, \"tool_name\": \"insert_into_memory\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -23214,7 +23502,7 @@ "data": { "event": { "delta": { - "text": "The", + "text": "Dec'. The data types of these columns are int64 for '", "type": "text" }, "event_type": { @@ -23234,7 +23522,7 @@ "data": { "event": { "delta": { - "text": " attention type used by Llama3-8B is grouped-query attention.", + "text": "Year' and float64 for the rest.\n\nIt appears that this CSV", "type": "text" }, "event_type": { @@ -23254,42 +23542,13 @@ "data": { "event": { "delta": { - "text": "", + "text": " file contains monthly inflation rates for different years. The 'Year' column", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": null - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Llama3-8B attention type\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:num-1\\nContent: 3 `_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\\nof models across a `range of different benchmarks `_.\\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. In this tutorial we will focus on the 8B size model.\\nThere are a few main changes between Llama2-7B and Llama3-8B models:\\n\\n- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B\\n- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:num-1\\nContent: instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_\\n\\n|\\n\\nGetting access to Llama3-8B-Instruct\\n------------------------------------\\n\\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. First, let's download the model from Hugging Face. You will need to follow the instructions\\non the `official Meta page `_ to gain access to the model.\\nNext, make sure you grab your Hugging Face token from `here `_.\\n\\n\\n.. code-block:: bash\\n\\n tune download meta-llama/Meta-Llama-3\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:num-0\\nContent: :`download Llama3 Instruct weights `\\n\\n\\nTemplate changes from Llama2 to Llama3\\n--------------------------------------\\n\\nThe Llama2 chat model requires a specific template when prompting the pre-trained\\nmodel. Since the chat model was pretrained with this prompt template, if you want to run\\ninference on the model, you'll need to use the same template for optimal performance\\non chat data. Otherwise, the model will just perform standard text completion, which\\nmay or may not align with your intended use case.\\n\\nFrom the `official Llama2 prompt\\ntemplate guide `_\\nfor the Llama2 chat model, we can see that special tags are added:\\n\\n.. code-block:: text\\n\\n [INST] <>\\n You are a helpful, respectful, and honest assistant.\\n <>\\n\\n Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant \\n\\nLlama3 Instruct `overhauled `\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:num-0\\nContent: 'm Meta AI, your friendly AI assistant<|eot_id|>\\n\\nThe tags are entirely different, and they are actually encoded differently than in\\nLlama2. Let's walk through tokenizing an example with the Llama2 template and the\\nLlama3 template to understand how.\\n\\n.. note::\\n The Llama3 Base model uses a `different prompt template\\n `_ than Llama3 Instruct\\n because it has not yet been instruct tuned and the extra special tokens are untrained. If you\\n are running inference on the Llama3 Base model without fine-tuning we recommend the base\\n template for optimal performance. Generally, for instruct and chat data, we recommend using\\n Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\\n Llama3 Instruct.\\n\\n.. _prompt_template_vs_special_tokens:\\n\\nTokenizing prompt templates & special tokens\\n--------------------------------------------\\n\\nLet's say I have a sample of a single user-assistant turn accompanied with a system\\nprompt:\\n\\n.. code-block:: python\\n\\n sample = [\\n {\\n \\\"role\\\": \\\"system\\\",\\n \\\"\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:num-3\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\", and \\\"output_proj\\\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\\\"q_proj\\\", \\\"v_proj\\\"])\\n\\n.. note::\\n\\n Calling :func:`lora_llama_2\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -23303,7 +23562,7 @@ "data": { "event": { "delta": { - "text": "The", + "text": " represents the year, and the rest of the columns represent the inflation rates", "type": "text" }, "event_type": { @@ -23323,7 +23582,7 @@ "data": { "event": { "delta": { - "text": " attention type used by Llama3", + "text": " for each month of the year", "type": "text" }, "event_type": { @@ -23343,7 +23602,7 @@ "data": { "event": { "delta": { - "text": "-8B is grouped-query attention.", + "text": ".", "type": "text" }, "event_type": { @@ -23378,13 +23637,29 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 326 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 125 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 451 + } + ] } } ], "type": "generator" }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Insert documents into memory\", \"parameters\": {}, \"tool_name\": \"insert_into_memory\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -23412,28 +23687,13 @@ "data": { "event": { "delta": { - "text": "{\n", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " \"type\": \"function\",\n ", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -23452,8 +23712,13 @@ "data": { "event": { "delta": { - "text": " \"name\": \"knowledge_search\",\n ", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\n# Load data\ndf = pd.read_csv(\"/var", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -23472,8 +23737,13 @@ "data": { "event": { "delta": { - "text": " \"parameters\": {\n \"query", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "/folders/cz/vyh7y1d11xg881lsx", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -23492,8 +23762,13 @@ "data": { "event": { "delta": { - "text": "\": \"Llama3-8B attention type\"\n }\n}", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "sshnc5c0000gn/T/tmp_d_cdeif/Uuct", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -23515,15 +23790,9 @@ "parse_status": { "__enum__": "ToolCallParseStatus", "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "Llama3-8B attention type" - }, - "call_id": "b2d62231-df92-43ed-b51f-f7b8a4bc4b15", - "tool_name": "knowledge_search" + "value": "in_progress" }, + "tool_call": "HlJzinflation.csv\")\n# Rows\nprint(\"Number of rows", "type": "tool_call" }, "event_type": { @@ -23532,59 +23801,6 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": null - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" - }, - "logprobs": null, "stop_reason": null }, "metrics": null @@ -23599,9 +23815,9 @@ "parse_status": { "__enum__": "ToolCallParseStatus", "__module__": "llama_stack.apis.common.content_types", - "value": "started" + "value": "in_progress" }, - "tool_call": "", + "tool_call": " and columns in the data:\", df.shape)\n# Columns\nprint(\"Columns", "type": "tool_call" }, "event_type": { @@ -23626,7 +23842,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "{\"type\": \"function\", \"name\":", + "tool_call": " of the data are:\", len(df.columns))\n# Column names\nprint(\"", "type": "tool_call" }, "event_type": { @@ -23651,7 +23867,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " \"knowledge_search\", \"parameters\": {\"query\": \"Llama", + "tool_call": "Columns of the data are:\", df.columns)\n# Column dtypes\nprint", "type": "tool_call" }, "event_type": { @@ -23676,7 +23892,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "3-8B attention type\"}}", + "tool_call": "(\"Datatype of the columns are:\", df.dtypes)", "type": "tool_call" }, "event_type": { @@ -23703,10 +23919,14 @@ }, "tool_call": { "arguments": { - "query": "Llama3-8B attention type" + "code": "import pandas as pd\n# Load data\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmp_d_cdeif/UuctHlJzinflation.csv\")\n# Rows\nprint(\"Number of rows and columns in the data:\", df.shape)\n# Columns\nprint(\"Columns of the data are:\", len(df.columns))\n# Column names\nprint(\"Columns of the data are:\", df.columns)\n# Column dtypes\nprint(\"Datatype of the columns are:\", df.dtypes)" }, - "call_id": "52c2b1ea-3695-4030-87a1-d0ca6d1056af", - "tool_name": "knowledge_search" + "call_id": "479e0208-711f-4318-b284-745599a9fb9c", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } }, "type": "tool_call" }, @@ -23746,13 +23966,29 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 36 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 46 + } + ] } } ], "type": "generator" }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the current CEO of Meta is.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"current CEO of Meta\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"{\\\"query\\\": \\\"current CEO of Meta\\\", \\\"top_k\\\": [{\\\"title\\\": \\\"Executives - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer Joel Kaplan, Chief Global Affairs Officer Susan Li, Chief Financial Officer Javier Olivan, Chief Operating Officer Chris Cox, Chief Product Officer Andrew \\\\u2018Boz\\\\u2019 Bosworth, Chief Technology Officer Jennifer Newstead, Chief Legal Officer Dave Wehner, Chief Strategy Officer Will Cathcart, Head of WhatsApp Naomi Gleit, Head of Product John Hegeman, Chief Revenue Officer Adam Mosseri, Head of Instagram Erin Egan, Chief Privacy Officer, Policy Michel Protti, Chief Privacy Officer, Product Alex Schultz, Chief Marketing Officer and VP of Analytics Tom Alison, Head of Facebook Nicola Mendelsohn, Head of Global Business Group Ahmad Al-Dahle, VP and Head of GenAI at Meta Joelle Pineau, Vice President of AI Research and Head of FAIR at Meta\\\", \\\"score\\\": 0.8190992, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\\\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\\\u00a9 2025 Meta\\\", \\\"score\\\": 0.79099923, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Meet the Executive CSuite Team of Meta (Facebook) [2025]\\\", \\\"url\\\": \\\"https://digitaldefynd.com/IQ/meet-the-executive-csuite-team-of-meta-facebook/\\\", \\\"content\\\": \\\"Harvard University Executive Programs Free Harvard University Courses As a chief financial officer of Meta, Susan Li oversees the firm\\\\u2019s finance and facilities team to keep track of the company\\\\u2019s overall financial health. The chief operating officer of Meta, Javier Olivan, oversees the firm\\\\u2019s business team, infrastructure, and other products. Andrew Bosworth, called Boz, serves as chief technology officer at Meta and is responsible for leading the firm\\\\u2019s AR/VR organization, Reality Labs. Andrew has also served as engineering director to oversee events, mobile monetization, and feed ads and as VP of ads and business platforms to lead engineering, design, analytics, and product teams. Meta\\\\u2019s c-suite team comprises experienced and diverse executives, having extensive experience in technology, finance, legal, and all major industries.\\\", \\\"score\\\": 0.7602419, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg: Founder and CEO of Meta (formerly Facebook) - Investopedia\\\", \\\"url\\\": \\\"https://www.investopedia.com/terms/m/mark-zuckerberg.asp\\\", \\\"content\\\": \\\"Mark Zuckerberg: Founder and CEO of Meta (formerly Facebook) Mark Zuckerberg: Founder and CEO of Meta (formerly Facebook) Mark Zuckerberg is a self-taught computer programmer and co-founder, chair, and chief executive officer of Meta (META), formerly known as Facebook. Mark Zuckerberg is a self-taught computer programmer and the co-founder, chair, and CEO of Meta (formerly Facebook). In April 2018, Zuckerberg testified on Capitol Hill about Facebook's use of users' information, including the sharing of 87 million users' information to Cambridge Analytica. Technically, Mark Zuckerberg makes a salary of $1 a year at Facebook. Booker Join With Facebook Founder and CEO Mark Zuckerberg to Advance a National Model for Improving Public Schools.\\\\\\\"\\\", \\\"score\\\": 0.74697095, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg - Forbes\\\", \\\"url\\\": \\\"https://www.forbes.com/profile/mark-zuckerberg/\\\", \\\"content\\\": \\\"Meta CEO Mark Zuckerberg \\\\u201cloved\\\\u201d an image on Facebook known as \\\\\\\"Challah Horse\\\\\\\" that happens to be AI-generated, highlighting the amount of AI spam on the platform. ### Meta Donates $1 Million To Trump\\\\u2019s Inaugural Fund Weeks After Mark Zuckerberg Met President Elect Meta has donated $1 million to President-elect Donald Trump\\\\u2019s inaugural fund, the company confirmed to various news outlets on Wednesday, a move that comes just weeks after its CEO Mark Zuckerberg met with Trump at his Mar-a-Lago residence in an apparent bid to mend years of strained ties. ### Meta Donates $1 Million To Trump\\\\u2019s Inaugural Fund Weeks After Mark Zuckerberg Met President-Elect Read the full profile on Forbes: https://www.forbes.com/sites/kerryadolan/2023/09/26/mark-gets-meta-zuckerberg-talks-ai-and-that-musk-mma-fight-thats-never-going-to-happen/?sh=671046e73037\\\", \\\"score\\\": 0.6410185, \\\"raw_content\\\": null}]}\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stdout]\\nNumber of rows and columns in the data: (10, 13)\\nColumns of the data are: 13\\nColumns of the data are: Index(['Year', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep',\\n 'Oct', 'Nov', 'Dec'],\\n dtype='object')\\nDatatype of the columns are: Year int64\\nJan float64\\nFeb float64\\nMar float64\\nApr float64\\nMay float64\\nJun float64\\nJul float64\\nAug float64\\nSep float64\\nOct float64\\nNov float64\\nDec float64\\ndtype: object\\n[/stdout]\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -23780,7 +24016,7 @@ "data": { "event": { "delta": { - "text": "The", + "text": "This", "type": "text" }, "event_type": { @@ -23800,7 +24036,7 @@ "data": { "event": { "delta": { - "text": " current CEO of Meta is Mark Zuckerberg.", + "text": " CSV file contains 10 rows and 13 columns. The columns are", "type": "text" }, "event_type": { @@ -23820,94 +24056,13 @@ "data": { "event": { "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "LWwngTMJ", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:47:24.889991+00:00", - "__module__": "datetime" - }, - "trace_id": "K0psyd28TdSkb8LK", - "type": "metric", - "unit": "tokens", - "value": 1203 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "LWwngTMJ", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:47:24.890015+00:00", - "__module__": "datetime" - }, - "trace_id": "K0psyd28TdSkb8LK", - "type": "metric", - "unit": "tokens", - "value": 19 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.1-8B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "LWwngTMJ", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-06T04:47:24.890017+00:00", - "__module__": "datetime" - }, - "trace_id": "K0psyd28TdSkb8LK", - "type": "metric", - "unit": "tokens", - "value": 1222 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the current CEO of Meta is.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"current CEO of Meta\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"{\\\"query\\\": \\\"current CEO of Meta\\\", \\\"top_k\\\": [{\\\"title\\\": \\\"Meet the Executive CSuite Team of Meta (Facebook) [2025]\\\", \\\"url\\\": \\\"https://digitaldefynd.com/IQ/meet-the-executive-csuite-team-of-meta-facebook/\\\", \\\"content\\\": \\\"Harvard University Executive Programs Free Harvard University Courses As a chief financial officer of Meta, Susan Li oversees the firm\\\\u2019s finance and facilities team to keep track of the company\\\\u2019s overall financial health. The chief operating officer of Meta, Javier Olivan, oversees the firm\\\\u2019s business team, infrastructure, and other products. Andrew Bosworth, called Boz, serves as chief technology officer at Meta and is responsible for leading the firm\\\\u2019s AR/VR organization, Reality Labs. Andrew has also served as engineering director to oversee events, mobile monetization, and feed ads and as VP of ads and business platforms to lead engineering, design, analytics, and product teams. Meta\\\\u2019s c-suite team comprises experienced and diverse executives, having extensive experience in technology, finance, legal, and all major industries.\\\", \\\"score\\\": 0.7602419, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg - Forbes\\\", \\\"url\\\": \\\"https://www.forbes.com/profile/mark-zuckerberg/\\\", \\\"content\\\": \\\"Meta has donated $1 million to President-elect Donald Trump's inaugural fund, the company confirmed to various news outlets on Wednesday, a move that comes just weeks after its CEO Mark\\\", \\\"score\\\": 0.6701125, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Meta - Leadership & Governance\\\", \\\"url\\\": \\\"https://investor.atmeta.com/leadership-and-governance/\\\", \\\"content\\\": \\\"Mr. Andreessen was a co-founder of Netscape Communications Corporation, a software company, serving in various positions, including Chief Technology Officer and Executive Vice President of Products. Ms. Killefer also served as Assistant Secretary for Management, Chief Financial Officer, and Chief Operating Officer of the U.S. Department of the Treasury from 1997 to 2000 and as a member of the IRS Oversight Board from 2000 to 2005, including as Chair of the IRS Oversight Board from 2002 to 2004. Ms. Travis has served as Executive Vice President and Chief Financial Officer of The Estee Lauder Companies Inc., a global manufacturer and marketer of skin care, makeup, fragrance and hair care products, since August 2012.\\\", \\\"score\\\": 0.6175132, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"META | Meta Platforms Inc. Company Profile & Executives - WSJ\\\", \\\"url\\\": \\\"https://www.wsj.com/market-data/quotes/META/company-people\\\", \\\"content\\\": \\\"Company profile for Meta Platforms Inc. including key executives, insider trading, ownership, revenue and average growth rates. View detailed META description & address.\\\", \\\"score\\\": 0.23361932, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg - Wikipedia\\\", \\\"url\\\": \\\"https://en.wikipedia.org/wiki/Mark_Zuckerberg\\\", \\\"content\\\": \\\"They began dating in 2003.[175] In September 2010, Chan, who was a medical student at the University of California, San Francisco at the time,[176] moved into his rented house in Palo Alto, California.[177][178] They married on May 19, 2012, in the grounds of his mansion in an event that also celebrated her graduation from medical school.[179][180] Zuckerberg revealed in July 2015 that they were expecting a baby girl and that Chan had previously experienced three miscarriages.[181] Their first daughter was born in December 2015.[182] They announced in a Chinese New Year video that their daughter's Chinese name is Chen Mingyu (Chinese: \\\\u9648\\\\u660e\\\\u5b87).[183] Their second daughter was born in August 2017.[184] Zuckerberg and his wife welcomed their third daughter in March 2023 and announced the news across his social media pages.[185] The couple also have a Puli dog named Beast,[186] who has over two million followers on Facebook.[187] Zuckerberg commissioned the visual artist Daniel Arsham to build a 7-foot-tall sculpture of his wife, which was unveiled in 2024.[188]\\\", \\\"score\\\": 0.05564338, \\\"raw_content\\\": null}]}\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", + "text": " named 'Year', 'Jan', 'Feb', 'Mar', '", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -23921,7 +24076,7 @@ "data": { "event": { "delta": { - "text": "The", + "text": "Apr', 'May', 'Jun', 'Jul', 'Aug',", "type": "text" }, "event_type": { @@ -23941,7 +24096,7 @@ "data": { "event": { "delta": { - "text": " current CEO of Meta is not explicitly stated in", + "text": " 'Sep', 'Oct', 'Nov', 'Dec'. The data", "type": "text" }, "event_type": { @@ -23961,7 +24116,7 @@ "data": { "event": { "delta": { - "text": " the search results. However, Mark Zuckerberg is mentioned as the CEO", + "text": " types of these columns are int64 for 'Year", "type": "text" }, "event_type": { @@ -23981,7 +24136,7 @@ "data": { "event": { "delta": { - "text": " of Meta in some of the search results, but it is not clear", + "text": "' and float64 for the rest.\n\nIt appears that this CSV file", "type": "text" }, "event_type": { @@ -24001,7 +24156,7 @@ "data": { "event": { "delta": { - "text": " if he is still the current CEO.", + "text": " contains monthly inflation rates for different years. The 'Year' column represents", "type": "text" }, "event_type": { @@ -24021,42 +24176,13 @@ "data": { "event": { "delta": { - "text": "", + "text": " the year, and the rest of the columns represent the inflation rates", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": null - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the current CEO of Meta is.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"current CEO of Meta\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"{\\\"query\\\": \\\"current CEO of Meta\\\", \\\"top_k\\\": [{\\\"title\\\": \\\"Meta - Leadership & Governance\\\", \\\"url\\\": \\\"https://investor.atmeta.com/leadership-and-governance/\\\", \\\"content\\\": \\\"Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. Mark is responsible for setting the overall direction and product strategy for the company. He leads the design of Meta's services and development of its core technology and infrastructure. Mark studied computer science at Harvard\\\", \\\"score\\\": 0.8342047, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\\\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\\\u00a9 2025 Meta\\\", \\\"score\\\": 0.79099923, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"The 11 People Running Meta's $1 Trillion Social Media and ... - Observer\\\", \\\"url\\\": \\\"https://observer.com/2024/01/meta-facebook-top-executives/\\\", \\\"content\\\": \\\"Meta has one of the most stable leadership team in the tech industry. Almost all of Meta's top executives have been with the company for well over a decade. ... 39, cofounder, chairman and CEO\\\", \\\"score\\\": 0.45536873, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Executives - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/\\\", \\\"content\\\": \\\"Meta leadership: images of senior executives for download to use in articles about the company.\\\", \\\"score\\\": 0.21026355, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg - Wikipedia\\\", \\\"url\\\": \\\"https://en.wikipedia.org/wiki/Mark_Zuckerberg\\\", \\\"content\\\": \\\"They began dating in 2003.[175] In September 2010, Chan, who was a medical student at the University of California, San Francisco at the time,[176] moved into his rented house in Palo Alto, California.[177][178] They married on May 19, 2012, in the grounds of his mansion in an event that also celebrated her graduation from medical school.[179][180] Zuckerberg revealed in July 2015 that they were expecting a baby girl and that Chan had previously experienced three miscarriages.[181] Their first daughter was born in December 2015.[182] They announced in a Chinese New Year video that their daughter's Chinese name is Chen Mingyu (Chinese: \\\\u9648\\\\u660e\\\\u5b87).[183] Their second daughter was born in August 2017.[184] Zuckerberg and his wife welcomed their third daughter in March 2023 and announced the news across his social media pages.[185] The couple also have a Puli dog named Beast,[186] who has over two million followers on Facebook.[187] Zuckerberg commissioned the visual artist Daniel Arsham to build a 7-foot-tall sculpture of his wife, which was unveiled in 2024.[188]\\\", \\\"score\\\": 0.05564338, \\\"raw_content\\\": null}]}\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -24070,7 +24196,7 @@ "data": { "event": { "delta": { - "text": "The", + "text": " for each month of the", "type": "text" }, "event_type": { @@ -24090,7 +24216,7 @@ "data": { "event": { "delta": { - "text": " current CEO of Meta is Mark Zuckerberg.", + "text": " year.", "type": "text" }, "event_type": { @@ -24125,13 +24251,29 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 327 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 125 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 452 + } + ] } } ], "type": "generator" }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the current CEO of Meta is.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -24189,7 +24331,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "brave_search.call(query=\"current CEO of Meta\")", + "tool_call": "import pandas as pd\n# Load data\ndf = pd.read", "type": "tool_call" }, "event_type": { @@ -24212,19 +24354,9 @@ "parse_status": { "__enum__": "ToolCallParseStatus", "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "current CEO of Meta" - }, - "call_id": "cc85a2df-6b2d-41c0-97dd-1509ca8061c4", - "tool_name": { - "__enum__": "BuiltinTool", - "__module__": "llama_stack.models.llama.datatypes", - "value": "brave_search" - } + "value": "in_progress" }, + "tool_call": "_csv(\"/var/folders/cz/vyh7", "type": "tool_call" }, "event_type": { @@ -24233,11 +24365,7 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } @@ -24248,42 +24376,43 @@ "data": { "event": { "delta": { - "text": "", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "y1d11xg881lsxsshnc5c000", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": \"get_boiling_point\", \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "0gn/T/tmpflpgiagc/2VkeqrPlinflation", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -24297,8 +24426,13 @@ "data": { "event": { "delta": { - "text": "The", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".csv\")\n# Rows\nprint(\"Number of rows and columns in", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -24317,8 +24451,13 @@ "data": { "event": { "delta": { - "text": " function `get_boiling_point` is not able to", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " the data:\", df.shape)\n# Columns\nprint(\"Columns of the data are", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -24337,8 +24476,13 @@ "data": { "event": { "delta": { - "text": " find the boiling point of polyjuice as", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ":\", len(df.columns))\n# Column names\nprint(\"Columns of the data", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -24357,8 +24501,13 @@ "data": { "event": { "delta": { - "text": " it is a fictional liquid from the Harry Potter series. The", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " are:\", df.columns)\n# Column dtypes\nprint(\"Datatype of", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -24377,8 +24526,13 @@ "data": { "event": { "delta": { - "text": " function is only able to find the boiling point of real liquids.", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " the columns are:\", df.dtypes)", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -24391,6 +24545,45 @@ "metrics": null } }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\n# Load data\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpflpgiagc/2VkeqrPlinflation.csv\")\n# Rows\nprint(\"Number of rows and columns in the data:\", df.shape)\n# Columns\nprint(\"Columns of the data are:\", len(df.columns))\n# Column names\nprint(\"Columns of the data are:\", df.columns)\n# Column dtypes\nprint(\"Datatype of the columns are:\", df.dtypes)" + }, + "call_id": "b8aab119-7997-428e-81ab-e6aa163f7acc", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", @@ -24412,13 +24605,29 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 36 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 46 + } + ] } } ], "type": "generator" }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:02bc2\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:e40e6\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:200a9\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"How to use LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:e40e6\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:e40e6\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:e40e6\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"required\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", + "text": "lora_llama2_7b` model in Torcht", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -24575,7 +24775,7 @@ "data": { "event": { "delta": { - "text": "The", + "text": "une, which applies LoRA to the Q and V projections by", "type": "text" }, "event_type": { @@ -24595,7 +24795,7 @@ "data": { "event": { "delta": { - "text": " function `get_boiling_point` is not able to find the", + "text": " default.\n4. Load the", "type": "text" }, "event_type": { @@ -24615,7 +24815,7 @@ "data": { "event": { "delta": { - "text": " boiling point of polyjuice as it is not a real liquid", + "text": " base model weights into the LoRA model without", "type": "text" }, "event_type": { @@ -24635,7 +24835,7 @@ "data": { "event": { "delta": { - "text": ".", + "text": " any conversion necessary.\n5. Set only LoRA parameters", "type": "text" }, "event_type": { @@ -24655,42 +24855,33 @@ "data": { "event": { "delta": { - "text": "", + "text": " to trainable.\n6. Run the LoRA", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": \"get_boiling_point\", \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", + "text": " finetuning recipe in Torchtune with the desired configuration.\n\n", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -24704,13 +24895,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" + "text": "You can also experiment with different LoRA configurations, such as applying Lo", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -24729,13 +24915,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "{\"type\": \"function\", \"name\": \"get_boiling", - "type": "tool_call" + "text": "RA to all linear layers in the self-attention", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -24754,13 +24935,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "_point\", \"parameters\": {\"liquid_name\": \"poly", - "type": "tool_call" + "text": ", increasing the rank, or scaling alpha and rank together.\n\n", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -24779,13 +24955,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "juice\"}}", - "type": "tool_call" + "text": "Note that LoRA can be beneficial for", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -24804,19 +24975,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "liquid_name": "polyjuice" - }, - "call_id": "83d9f330-4c7a-4dd3-8fcb-ccc5301c1f83", - "tool_name": "get_boiling_point" - }, - "type": "tool_call" + "text": " reducing memory usage during fine-tuning, but it may also", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -24824,11 +24984,47 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " impact model performance. You can trade off memory and model performance by", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " adjusting the LoRA configuration and running experiments with different settings.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null }, "metrics": null } @@ -24854,13 +25050,29 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 158 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 212 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 370 + } + ] } } ], "type": "generator" }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:02bc2\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:e40e6\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:200a9\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -24888,13 +25100,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" + "text": "{\"", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -24913,13 +25120,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "{\"type\": \"function\", \"name\":", - "type": "tool_call" + "text": "type\": \"function\", \"name\": \"", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -24938,13 +25140,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " \"get_boiling_point\", \"parameters\": {\"liquid_name\":", - "type": "tool_call" + "text": "knowledge_search\", \"parameters\": {\"query\": \"", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -24963,13 +25160,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " \"polyjuice\"}}", - "type": "tool_call" + "text": "How to use LoRA in Torchtune\"}}", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -24995,10 +25187,10 @@ }, "tool_call": { "arguments": { - "liquid_name": "polyjuice" + "query": "How to use LoRA in Torchtune" }, - "call_id": "98c63572-06c8-4cc0-a14e-3b10fb9ddc19", - "tool_name": "get_boiling_point" + "call_id": "6ee142d9-1a65-433e-a681-f20066a2e1f7", + "tool_name": "knowledge_search" }, "type": "tool_call" }, @@ -25038,13 +25230,29 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 117 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 40 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 157 + } + ] } } ], "type": "generator" }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"none\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:02bc2\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:e40e6\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:200a9\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -25092,7 +25300,7 @@ "data": { "event": { "delta": { - "text": " couldn't find any information on the boiling point of Polyjuice", + "text": "'m ready to help you answer questions about Torcht", "type": "text" }, "event_type": { @@ -25112,7 +25320,7 @@ "data": { "event": { "delta": { - "text": ". Polyjuice is a magical potion in the Harry Potter series", + "text": "une based on the documentation you provided.", "type": "text" }, "event_type": { @@ -25132,7 +25340,7 @@ "data": { "event": { "delta": { - "text": " that allows the drinker to transform into someone else. It's", + "text": " What's your first question?", "type": "text" }, "event_type": { @@ -25152,33 +25360,58 @@ "data": { "event": { "delta": { - "text": " not a physical substance with a boiling point. If you have any", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 75 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 35 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 110 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:16a6a\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:cc255\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:7a06a\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"How to use LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:cc255\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:cc255\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:cc255\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", + "text": " the `lora_attn_modules` argument to apply LoRA to", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -25425,7 +25605,7 @@ "data": { "event": { "delta": { - "text": "The", + "text": " all linear layers in the self-attention.\n5. ", "type": "text" }, "event_type": { @@ -25445,7 +25625,7 @@ "data": { "event": { "delta": { - "text": " 100th prime number is 541.", + "text": " Increase the rank and alpha values to experiment with different LoRA configurations.\n", "type": "text" }, "event_type": { @@ -25465,42 +25645,33 @@ "data": { "event": { "delta": { - "text": "", + "text": "6. Run the LoRA finetuning", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Write code and execute it to find the answer for: What is the 100th prime number?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", + "text": " recipe in Torchtune using the `lora_finetune", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -25514,13 +25685,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" + "text": "_distributed` command.\n7. Monitor the", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -25539,13 +25705,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "def is_prime(n):\n if n <= 1:\n ", - "type": "tool_call" + "text": " loss curves and adjust the LoRA configuration as needed to trade off", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -25564,13 +25725,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " return False\n if n <= 3:\n return True", - "type": "tool_call" + "text": " memory and model performance.\n\nBy following these steps, you can effectively use", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -25589,13 +25745,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "\n if n % 2 == 0 or n % 3 == 0:\n return False\n i = 5\n while i * i <= n:\n if n % i == 0 or n % (i +", - "type": "tool_call" + "text": " LoRA in Torchtune to fine-tune", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -25614,13 +25765,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " 2) == 0:\n return False\n ", - "type": "tool_call" + "text": " Llama2 models with a low memory footprint.", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -25639,18 +25785,58 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " i += 6\n return True\n\ndef get_nth_prime(n):\n count", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 158 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 206 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 364 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:16a6a\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:cc255\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:7a06a\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" }, "logprobs": null, "stop_reason": null @@ -25664,13 +25850,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " = 0\n num = 2\n while True:\n if", - "type": "tool_call" + "text": "{\"", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -25689,13 +25870,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " is_prime(num):\n count += 1", - "type": "tool_call" + "text": "type\": \"function\", \"name\": \"knowledge_search\", \"parameters", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -25714,13 +25890,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "\n if count == n:\n return num\n num += ", - "type": "tool_call" + "text": "\": {\"query\": \"How to use LoRA in Torchtune", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -25739,13 +25910,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "1\n\nprint(get_nth_prime(100))", - "type": "tool_call" + "text": "\"}}", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -25771,14 +25937,10 @@ }, "tool_call": { "arguments": { - "code": "def is_prime(n):\n if n <= 1:\n return False\n if n <= 3:\n return True\n if n % 2 == 0 or n % 3 == 0:\n return False\n i = 5\n while i * i <= n:\n if n % i == 0 or n % (i + 2) == 0:\n return False\n i += 6\n return True\n\ndef get_nth_prime(n):\n count = 0\n num = 2\n while True:\n if is_prime(num):\n count += 1\n if count == n:\n return num\n num += 1\n\nprint(get_nth_prime(100))" + "query": "How to use LoRA in Torchtune" }, - "call_id": "7fca0515-82f3-46e1-bbec-eceb8fa5162e", - "tool_name": { - "__enum__": "BuiltinTool", - "__module__": "llama_stack.models.llama.datatypes", - "value": "code_interpreter" - } + "call_id": "a7b02498-0a50-40c2-abf2-563d4d26d01f", + "tool_name": "knowledge_search" }, "type": "tool_call" }, @@ -25818,13 +25980,29 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 117 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 40 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 157 + } + ] } } ], "type": "generator" }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"when was Perplexity the company founded?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Perplexity company founding date\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Perplexity company founding date\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:16a6a\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:cc255\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:7a06a\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -25852,7 +26030,7 @@ "data": { "event": { "delta": { - "text": "Per", + "text": "I", "type": "text" }, "event_type": { @@ -25872,7 +26050,7 @@ "data": { "event": { "delta": { - "text": "plexity the company was founded in 2022", + "text": "'m ready to help you answer questions about Torchtune based on the", "type": "text" }, "event_type": { @@ -25892,7 +26070,7 @@ "data": { "event": { "delta": { - "text": ".", + "text": " documentation you provided. What's your first question?", "type": "text" }, "event_type": { @@ -25927,13 +26105,29 @@ "value": "end_of_turn" } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 75 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 35 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 110 + } + ] } } ], "type": "generator" }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"when was Perplexity the company founded?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Perplexity company founding date\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:24443\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:961ff\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:b49f7\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"How to use LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:961ff\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:961ff\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:961ff\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"when was the nba created\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", + "text": ".\n6. Use the `lora", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -26309,7 +26435,7 @@ "data": { "event": { "delta": { - "text": "The", + "text": "_finetune_distributed` recipe in Torcht", "type": "text" }, "event_type": { @@ -26329,7 +26455,7 @@ "data": { "event": { "delta": { - "text": " NBA was created on August 3, 1949, with the", + "text": "une to run a LoRA finetune with two", "type": "text" }, "event_type": { @@ -26349,7 +26475,7 @@ "data": { "event": { "delta": { - "text": " merger of the Basketball Association of America (BAA) and the National", + "text": " GPUs.\n7. Modify the", "type": "text" }, "event_type": { @@ -26369,7 +26495,7 @@ "data": { "event": { "delta": { - "text": " Basketball League (NBL).", + "text": " `lora_finetune_distributed` config", "type": "text" }, "event_type": { @@ -26389,42 +26515,33 @@ "data": { "event": { "delta": { - "text": "", + "text": " to apply LoRA to all linear layers in the self-", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"when was the nba created?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"when was the nba created\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", + "text": "attention and increase the rank and alpha values.\n8. Run", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -26438,7 +26555,7 @@ "data": { "event": { "delta": { - "text": "{\"", + "text": " the experiment using the modified config.\n\nBy", "type": "text" }, "event_type": { @@ -26458,7 +26575,7 @@ "data": { "event": { "delta": { - "text": "type\": \"function\", \"name\": \"knowledge_search\", \"parameters", + "text": " following these steps, you can use LoRA", "type": "text" }, "event_type": { @@ -26478,7 +26595,7 @@ "data": { "event": { "delta": { - "text": "\": {\"query\": \"when was the nba created\"}}", + "text": " in Torchtune to fine-tune a Llama2 model", "type": "text" }, "event_type": { @@ -26498,19 +26615,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "when was the nba created" - }, - "call_id": "7b01a40d-a6a8-4c86-b91d-1790e7480e57", - "tool_name": "knowledge_search" - }, - "type": "tool_call" + "text": " with a low memory footprint and experiment with different LoRA configurations", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -26518,11 +26624,27 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ".", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null }, "metrics": null } @@ -26554,7 +26676,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"when was the nba created?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:24443\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:961ff\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:b49f7\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -26582,13 +26704,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" + "text": "{\"", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -26607,13 +26724,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "{\"type\": \"function\", \"name\": \"knowledge_search\",", - "type": "tool_call" + "text": "type\": \"function\", \"name\": \"knowledge_search\", \"parameters\":", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -26632,13 +26744,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " \"parameters\": {\"query\": \"when was the nba created", - "type": "tool_call" + "text": " {\"query\": \"How to use LoRA in Torchtune\"}}", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -26660,9 +26767,15 @@ "parse_status": { "__enum__": "ToolCallParseStatus", "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "How to use LoRA in Torchtune" + }, + "call_id": "0d852474-6781-48ed-b8c1-778bd0f4e7f0", + "tool_name": "knowledge_search" }, - "tool_call": "\"}}", "type": "tool_call" }, "event_type": { @@ -26671,7 +26784,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -26682,24 +26799,13 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "when was the nba created" - }, - "call_id": "bbaf750a-0337-4c83-9bf2-76c2f72d45c3", - "tool_name": "knowledge_search" - }, - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, "stop_reason": { @@ -26710,6 +26816,111 @@ }, "metrics": null } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:24443\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:961ff\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:b49f7\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "'m ready to help you answer questions about", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " Torchtune based on the documentation you provided. What's your", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " first question?", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } }, { "__module__": "llama_stack.apis.inference.inference", @@ -26738,7 +26949,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. \", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:2a4c4\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:d4e29\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:d68cc\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"How to use LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:d4e29\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:d4e29\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:d4e29\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", + "text": " self-attention, increasing the rank, or scaling alpha", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -27051,7 +27237,7 @@ "data": { "event": { "delta": { - "text": "[", + "text": " and rank together.\n\nNote that LoRA can be beneficial for reducing memory usage", "type": "text" }, "event_type": { @@ -27071,7 +27257,7 @@ "data": { "event": { "delta": { - "text": "get_boiling_point(liquid_name=\"polyjuice\", celcius", + "text": " during fine-tuning, but it may also impact model performance. You", "type": "text" }, "event_type": { @@ -27091,7 +27277,7 @@ "data": { "event": { "delta": { - "text": "=True)]", + "text": " can trade off memory and model performance by adjusting the LoRA configuration and", "type": "text" }, "event_type": { @@ -27111,20 +27297,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "celcius": true, - "liquid_name": "polyjuice" - }, - "call_id": "fc83cd58-3cfb-431d-a1e2-a8572d682e2f", - "tool_name": "get_boiling_point" - }, - "type": "tool_call" + "text": " running experiments with different settings.", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -27132,11 +27306,7 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } @@ -27162,65 +27332,13 @@ "value": "end_of_turn" } }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "YhFB39Ik", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:31.335148+00:00", - "__module__": "datetime" - }, - "trace_id": "3n2xEtjLQt6ZGVR_", - "type": "metric", - "unit": "tokens", - "value": 267 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "YhFB39Ik", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:31.335179+00:00", - "__module__": "datetime" - }, - "trace_id": "3n2xEtjLQt6ZGVR_", - "type": "metric", - "unit": "tokens", - "value": 28 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "YhFB39Ik", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:31.335185+00:00", - "__module__": "datetime" - }, - "trace_id": "3n2xEtjLQt6ZGVR_", - "type": "metric", - "unit": "tokens", - "value": 295 - } - ] + "metrics": null } } ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. \", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:2a4c4\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:d4e29\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:d68cc\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -27248,7 +27366,7 @@ "data": { "event": { "delta": { - "text": "[", + "text": "{\"", "type": "text" }, "event_type": { @@ -27268,7 +27386,7 @@ "data": { "event": { "delta": { - "text": "get_boiling_point(liquid_name=\"polyjuice\", celcius", + "text": "type\": \"function\", \"name\":", "type": "text" }, "event_type": { @@ -27288,7 +27406,27 @@ "data": { "event": { "delta": { - "text": "=True)]", + "text": " \"knowledge_search\", \"parameters\": {\"query\": \"How", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " to use LoRA in Torchtune\"}}", "type": "text" }, "event_type": { @@ -27315,11 +27453,10 @@ }, "tool_call": { "arguments": { - "celcius": true, - "liquid_name": "polyjuice" + "query": "How to use LoRA in Torchtune" }, - "call_id": "7d41a671-f3ce-46dd-b001-443aaa65ccb7", - "tool_name": "get_boiling_point" + "call_id": "6070c836-0c9c-4f87-ba52-d9bf9ed44195", + "tool_name": "knowledge_search" }, "type": "tool_call" }, @@ -27359,65 +27496,13 @@ "value": "end_of_turn" } }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "lnqeV_cZ", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:29.708270+00:00", - "__module__": "datetime" - }, - "trace_id": "me4qbUSCQ5yKvrAG", - "type": "metric", - "unit": "tokens", - "value": 211 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "lnqeV_cZ", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:29.708281+00:00", - "__module__": "datetime" - }, - "trace_id": "me4qbUSCQ5yKvrAG", - "type": "metric", - "unit": "tokens", - "value": 28 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "lnqeV_cZ", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:29.708284+00:00", - "__module__": "datetime" - }, - "trace_id": "me4qbUSCQ5yKvrAG", - "type": "metric", - "unit": "tokens", - "value": 239 - } - ] + "metrics": null } } ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. \", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:2a4c4\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:d4e29\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:d68cc\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -27445,7 +27530,7 @@ "data": { "event": { "delta": { - "text": "[", + "text": "I", "type": "text" }, "event_type": { @@ -27465,7 +27550,7 @@ "data": { "event": { "delta": { - "text": "get_boiling_point(liquid_name=\"polyjuice\", celcius", + "text": "'m ready to help you answer questions about Tor", "type": "text" }, "event_type": { @@ -27485,7 +27570,7 @@ "data": { "event": { "delta": { - "text": "=True)]", + "text": "chtune based on the documentation you provided", "type": "text" }, "event_type": { @@ -27505,20 +27590,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "celcius": true, - "liquid_name": "polyjuice" - }, - "call_id": "21c8e60f-d205-4b3d-b065-47fa56dcd273", - "tool_name": "get_boiling_point" - }, - "type": "tool_call" + "text": ". What's your first question?", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -27526,11 +27599,7 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } @@ -27556,65 +27625,13 @@ "value": "end_of_turn" } }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "TDJHPVDZ", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:28.195776+00:00", - "__module__": "datetime" - }, - "trace_id": "r2GKj8iqTYaNxTeq", - "type": "metric", - "unit": "tokens", - "value": 155 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "TDJHPVDZ", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:28.195808+00:00", - "__module__": "datetime" - }, - "trace_id": "r2GKj8iqTYaNxTeq", - "type": "metric", - "unit": "tokens", - "value": 28 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "TDJHPVDZ", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:28.195814+00:00", - "__module__": "datetime" - }, - "trace_id": "r2GKj8iqTYaNxTeq", - "type": "metric", - "unit": "tokens", - "value": 183 - } - ] + "metrics": null } } ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. \", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:7bf28\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:b299f\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:af719\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"How to use LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:b299f\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:b299f\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:b299f\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:7bf28\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:b299f\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:af719\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -28140,7 +28084,7 @@ "data": { "event": { "delta": { - "text": "The", + "text": "{\"", "type": "text" }, "event_type": { @@ -28160,7 +28104,7 @@ "data": { "event": { "delta": { - "text": " function get", + "text": "type\": \"function\", \"name\":", "type": "text" }, "event_type": { @@ -28180,7 +28124,7 @@ "data": { "event": { "delta": { - "text": "_boiling", + "text": " \"knowledge_search\", \"parameters", "type": "text" }, "event_type": { @@ -28200,7 +28144,7 @@ "data": { "event": { "delta": { - "text": "_point is", + "text": "\": {\"query\": \"How to use Lo", "type": "text" }, "event_type": { @@ -28220,7 +28164,7 @@ "data": { "event": { "delta": { - "text": " not able", + "text": "RA in Torchtune\"}}", "type": "text" }, "event_type": { @@ -28240,8 +28184,19 @@ "data": { "event": { "delta": { - "text": " to determine", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "How to use LoRA in Torchtune" + }, + "call_id": "3d9a3bd1-4a05-4feb-b5a2-eed7a7a24f1b", + "tool_name": "knowledge_search" + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -28249,6 +28204,75 @@ "value": "progress" }, "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 117 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 40 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 157 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:7bf28\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:b299f\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:af719\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, "stop_reason": null }, "metrics": null @@ -28260,7 +28284,7 @@ "data": { "event": { "delta": { - "text": " the boiling", + "text": "I", "type": "text" }, "event_type": { @@ -28280,7 +28304,7 @@ "data": { "event": { "delta": { - "text": " point of", + "text": "'m ready to help you answer questions about", "type": "text" }, "event_type": { @@ -28300,7 +28324,7 @@ "data": { "event": { "delta": { - "text": " \"polyju", + "text": " Torchtune based on the documentation you provided", "type": "text" }, "event_type": { @@ -28320,7 +28344,7 @@ "data": { "event": { "delta": { - "text": "ice\"", + "text": ". What's your first question?", "type": "text" }, "event_type": { @@ -28340,7 +28364,72 @@ "data": { "event": { "delta": { - "text": " as it", + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 75 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 35 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 110 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:8c1f5\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:13786\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:f9c19\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"How to use LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:13786\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:13786\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:13786\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", + "text": " scaling alpha and rank together.\n\nBy following these steps,", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -28645,7 +28729,7 @@ "data": { "event": { "delta": { - "text": "The", + "text": " you can use LoRA in Torchtune to", "type": "text" }, "event_type": { @@ -28665,7 +28749,7 @@ "data": { "event": { "delta": { - "text": " function get_boiling_point is not", + "text": " fine-tune a Llama2 model with a", "type": "text" }, "event_type": { @@ -28685,7 +28769,7 @@ "data": { "event": { "delta": { - "text": " recognized.", + "text": " low memory footprint and achieve good performance.", "type": "text" }, "event_type": { @@ -28723,54 +28807,54 @@ "metrics": [ { "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "model_id": "meta-llama/Llama-3.1-8B-Instruct", "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "Z7jBGJ-8", + "span_id": "fHo5RmyV", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:55.401637+00:00", + "__datetime__": "2025-03-06T04:49:04.498360+00:00", "__module__": "datetime" }, - "trace_id": "WxMAq579Q-ixJ3wJ", + "trace_id": "NIVx0ka-TmKDiZaU", "type": "metric", "unit": "tokens", - "value": 93 + "value": 158 }, { "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "model_id": "meta-llama/Llama-3.1-8B-Instruct", "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "Z7jBGJ-8", + "span_id": "fHo5RmyV", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:55.401666+00:00", + "__datetime__": "2025-03-06T04:49:04.498396+00:00", "__module__": "datetime" }, - "trace_id": "WxMAq579Q-ixJ3wJ", + "trace_id": "NIVx0ka-TmKDiZaU", "type": "metric", "unit": "tokens", - "value": 20 + "value": 200 }, { "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "model_id": "meta-llama/Llama-3.1-8B-Instruct", "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "Z7jBGJ-8", + "span_id": "fHo5RmyV", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:55.401670+00:00", + "__datetime__": "2025-03-06T04:49:04.498403+00:00", "__module__": "datetime" }, - "trace_id": "WxMAq579Q-ixJ3wJ", + "trace_id": "NIVx0ka-TmKDiZaU", "type": "metric", "unit": "tokens", - "value": 113 + "value": 358 } ] } @@ -28778,7 +28862,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point_with_metadata\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point_with_metadata\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}]": { + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:8c1f5\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:13786\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:f9c19\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -28806,7 +28890,7 @@ "data": { "event": { "delta": { - "text": "The", + "text": "{\"", "type": "text" }, "event_type": { @@ -28826,7 +28910,7 @@ "data": { "event": { "delta": { - "text": " function call", + "text": "type\": \"function\", \"", "type": "text" }, "event_type": { @@ -28846,7 +28930,7 @@ "data": { "event": { "delta": { - "text": " should be", + "text": "name\": \"knowledge_search\", \"parameters", "type": "text" }, "event_type": { @@ -28866,7 +28950,7 @@ "data": { "event": { "delta": { - "text": " [get", + "text": "\": {\"query\": \"How to use LoRA in Torcht", "type": "text" }, "event_type": { @@ -28886,7 +28970,7 @@ "data": { "event": { "delta": { - "text": "_boiling", + "text": "une\"}}", "type": "text" }, "event_type": { @@ -28906,8 +28990,19 @@ "data": { "event": { "delta": { - "text": "_point_with", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "How to use LoRA in Torchtune" + }, + "call_id": "7815c1ab-fbdf-42e8-84a7-b1f74f67d863", + "tool_name": "knowledge_search" + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -28915,7 +29010,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -28926,53 +29025,94 @@ "data": { "event": { "delta": { - "text": "_metadata(", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "liquid_name", - "type": "text" + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "KM-vILDG", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:01.270069+00:00", + "__module__": "datetime" + }, + "trace_id": "NIVx0ka-TmKDiZaU", + "type": "metric", + "unit": "tokens", + "value": 117 }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "KM-vILDG", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:01.270143+00:00", + "__module__": "datetime" + }, + "trace_id": "NIVx0ka-TmKDiZaU", + "type": "metric", + "unit": "tokens", + "value": 40 }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "KM-vILDG", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:01.270151+00:00", + "__module__": "datetime" + }, + "trace_id": "NIVx0ka-TmKDiZaU", + "type": "metric", + "unit": "tokens", + "value": 157 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:8c1f5\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:13786\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:f9c19\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "=\"poly", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -28986,7 +29126,7 @@ "data": { "event": { "delta": { - "text": "juice", + "text": "I", "type": "text" }, "event_type": { @@ -29006,7 +29146,7 @@ "data": { "event": { "delta": { - "text": "\", cel", + "text": "'m ready to help you answer questions about Torcht", "type": "text" }, "event_type": { @@ -29026,7 +29166,7 @@ "data": { "event": { "delta": { - "text": "cius", + "text": "une based on the documentation you provided. What's your first", "type": "text" }, "event_type": { @@ -29046,7 +29186,7 @@ "data": { "event": { "delta": { - "text": "=True)]", + "text": " question?", "type": "text" }, "event_type": { @@ -29083,19 +29223,55 @@ }, "metrics": [ { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, "metric": "prompt_tokens", - "unit": null, - "value": 86 + "span_id": "5yc3Hts6", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:48:59.857021+00:00", + "__module__": "datetime" + }, + "trace_id": "6KRztpbwTwquLEUn", + "type": "metric", + "unit": "tokens", + "value": 75 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, "metric": "completion_tokens", - "unit": null, + "span_id": "5yc3Hts6", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:48:59.857048+00:00", + "__module__": "datetime" + }, + "trace_id": "6KRztpbwTwquLEUn", + "type": "metric", + "unit": "tokens", "value": 35 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, "metric": "total_tokens", - "unit": null, - "value": 121 + "span_id": "5yc3Hts6", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:48:59.857055+00:00", + "__module__": "datetime" + }, + "trace_id": "6KRztpbwTwquLEUn", + "type": "metric", + "unit": "tokens", + "value": 110 } ] } @@ -29103,7 +29279,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point_with_metadata\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point_with_metadata` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point_with_metadata\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}]": { + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:a4c57\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:46132\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:392a8\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"How to use LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:46132\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:46132\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:46132\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:a4c57\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:46132\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:392a8\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -29613,7 +29676,7 @@ "data": { "event": { "delta": { - "text": "[", + "text": "{\"", "type": "text" }, "event_type": { @@ -29633,7 +29696,7 @@ "data": { "event": { "delta": { - "text": "get_bo", + "text": "type\": \"function\", \"name\":", "type": "text" }, "event_type": { @@ -29653,7 +29716,7 @@ "data": { "event": { "delta": { - "text": "iling_point", + "text": " \"knowledge_search\", \"parameters\": {\"query\": \"How to", "type": "text" }, "event_type": { @@ -29673,7 +29736,7 @@ "data": { "event": { "delta": { - "text": "_with_metadata", + "text": " use LoRA in Torchtune\"}}", "type": "text" }, "event_type": { @@ -29693,8 +29756,19 @@ "data": { "event": { "delta": { - "text": "(liquid", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "How to use LoRA in Torchtune" + }, + "call_id": "45ec3014-ff3f-4d0b-9649-30a299f7b9d4", + "tool_name": "knowledge_search" + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -29702,7 +29776,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -29713,33 +29791,42 @@ "data": { "event": { "delta": { - "text": "_name='", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:a4c57\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:46132\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:392a8\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "polyju", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -29753,7 +29840,7 @@ "data": { "event": { "delta": { - "text": "ice',", + "text": "I", "type": "text" }, "event_type": { @@ -29773,7 +29860,7 @@ "data": { "event": { "delta": { - "text": " celci", + "text": "'m ready to help you answer", "type": "text" }, "event_type": { @@ -29793,7 +29880,7 @@ "data": { "event": { "delta": { - "text": "us=True", + "text": " questions about Torchtune based on the documentation you provided.", "type": "text" }, "event_type": { @@ -29813,7 +29900,7 @@ "data": { "event": { "delta": { - "text": ")]", + "text": " What's your first question?", "type": "text" }, "event_type": { @@ -29827,42 +29914,6 @@ "metrics": null } }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "celcius": true, - "liquid_name": "polyjuice" - }, - "call_id": "71898ef2-4a6d-4131-ac62-281c8fb5d29c", - "tool_name": "get_boiling_point_with_metadata" - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": null - } - }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", @@ -29884,29 +29935,13 @@ "value": "end_of_turn" } }, - "metrics": [ - { - "metric": "prompt_tokens", - "unit": null, - "value": 37 - }, - { - "metric": "completion_tokens", - "unit": null, - "value": 30 - }, - { - "metric": "total_tokens", - "unit": null, - "value": 67 - } - ] + "metrics": null } } ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Give me a sentence that contains the word: hello\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": []}]": { + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:b222e\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:1b69d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:deca9\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"How to use LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:1b69d\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:1b69d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:1b69d\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\\n# Sample of data\\nprint(\\\"Data sample from file:\\\")\\nprint(df.head())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\\n# Sample of data\\nprint(\\\"Data sample from file:\\\")\\nprint(df.head())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "I", + "text": " increasing the rank to 16 or ", "type": "text" }, "event_type": { @@ -30299,7 +30249,7 @@ "data": { "event": { "delta": { - "text": " am not able", + "text": "32, and scaling alpha and rank together.\n\nNote that LoRA", "type": "text" }, "event_type": { @@ -30319,7 +30269,7 @@ "data": { "event": { "delta": { - "text": " to execute this task as", + "text": " can be beneficial for reducing memory usage during fine-tuning", "type": "text" }, "event_type": { @@ -30339,7 +30289,7 @@ "data": { "event": { "delta": { - "text": " it exceeds the", + "text": ", but it may also impact model performance", "type": "text" }, "event_type": { @@ -30359,7 +30309,7 @@ "data": { "event": { "delta": { - "text": " limitations of the functions I", + "text": ". You can trade off memory and model", "type": "text" }, "event_type": { @@ -30379,7 +30329,7 @@ "data": { "event": { "delta": { - "text": " have been given.", + "text": " performance by adjusting the LoRA configuration.", "type": "text" }, "event_type": { @@ -30417,54 +30367,54 @@ "metrics": [ { "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "model_id": "meta-llama/Llama-3.1-8B-Instruct", "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "5If5go-q", + "span_id": "BHazvRV1", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:48.070675+00:00", + "__datetime__": "2025-03-06T04:49:10.165627+00:00", "__module__": "datetime" }, - "trace_id": "StUjhrTMQKKQSRvS", + "trace_id": "1NwedpozRqOVQXRs", "type": "metric", "unit": "tokens", - "value": 433 + "value": 158 }, { "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "model_id": "meta-llama/Llama-3.1-8B-Instruct", "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "5If5go-q", + "span_id": "BHazvRV1", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:48.070742+00:00", + "__datetime__": "2025-03-06T04:49:10.165662+00:00", "__module__": "datetime" }, - "trace_id": "StUjhrTMQKKQSRvS", + "trace_id": "1NwedpozRqOVQXRs", "type": "metric", "unit": "tokens", - "value": 31 + "value": 202 }, { "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "model_id": "meta-llama/Llama-3.1-8B-Instruct", "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "5If5go-q", + "span_id": "BHazvRV1", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:48.070750+00:00", + "__datetime__": "2025-03-06T04:49:10.165670+00:00", "__module__": "datetime" }, - "trace_id": "StUjhrTMQKKQSRvS", + "trace_id": "1NwedpozRqOVQXRs", "type": "metric", "unit": "tokens", - "value": 464 + "value": 360 } ] } @@ -30472,7 +30422,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\\n# Sample of data\\nprint(\\\"Data sample from file:\\\")\\nprint(df.head())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:b222e\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:1b69d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:deca9\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -30500,7 +30450,7 @@ "data": { "event": { "delta": { - "text": "The", + "text": "{\"", "type": "text" }, "event_type": { @@ -30520,7 +30470,7 @@ "data": { "event": { "delta": { - "text": " error message", + "text": "type\": \"function\", \"name\": \"knowledge_search\", \"", "type": "text" }, "event_type": { @@ -30540,7 +30490,7 @@ "data": { "event": { "delta": { - "text": " indicates that", + "text": "parameters\": {\"query\": \"How to use LoRA in Tor", "type": "text" }, "event_type": { @@ -30560,7 +30510,7 @@ "data": { "event": { "delta": { - "text": " the file", + "text": "chtune\"}}", "type": "text" }, "event_type": { @@ -30580,8 +30530,19 @@ "data": { "event": { "delta": { - "text": " 'b", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "How to use LoRA in Torchtune" + }, + "call_id": "c92271a7-37e2-4396-aa7f-5805b9273a71", + "tool_name": "knowledge_search" + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -30589,7 +30550,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -30600,53 +30565,94 @@ "data": { "event": { "delta": { - "text": "wrap'", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " was", - "type": "text" + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "Z6HS-lIg", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:08.648346+00:00", + "__module__": "datetime" + }, + "trace_id": "1NwedpozRqOVQXRs", + "type": "metric", + "unit": "tokens", + "value": 117 }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "Z6HS-lIg", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:08.648375+00:00", + "__module__": "datetime" + }, + "trace_id": "1NwedpozRqOVQXRs", + "type": "metric", + "unit": "tokens", + "value": 40 }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "Z6HS-lIg", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:08.648382+00:00", + "__module__": "datetime" + }, + "trace_id": "1NwedpozRqOVQXRs", + "type": "metric", + "unit": "tokens", + "value": 157 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:b222e\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:1b69d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:deca9\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " not found", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -30660,7 +30666,7 @@ "data": { "event": { "delta": { - "text": ". This", + "text": "I", "type": "text" }, "event_type": { @@ -30680,7 +30686,7 @@ "data": { "event": { "delta": { - "text": " is likely", + "text": "'m ready to help you answer questions about", "type": "text" }, "event_type": { @@ -30700,7 +30706,7 @@ "data": { "event": { "delta": { - "text": " because the", + "text": " Torchtune based on the documentation you provided. What's your", "type": "text" }, "event_type": { @@ -30720,7 +30726,7 @@ "data": { "event": { "delta": { - "text": " file", + "text": " first question?", "type": "text" }, "event_type": { @@ -30740,53 +30746,94 @@ "data": { "event": { "delta": { - "text": " path provided", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " is incorrect", - "type": "text" + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "o33PSCts", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:07.268876+00:00", + "__module__": "datetime" + }, + "trace_id": "edTwKHK5Q4K8yCqt", + "type": "metric", + "unit": "tokens", + "value": 75 }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "o33PSCts", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:07.268906+00:00", + "__module__": "datetime" + }, + "trace_id": "edTwKHK5Q4K8yCqt", + "type": "metric", + "unit": "tokens", + "value": 35 }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "o33PSCts", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:49:07.268914+00:00", + "__module__": "datetime" + }, + "trace_id": "edTwKHK5Q4K8yCqt", + "type": "metric", + "unit": "tokens", + "value": 110 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:bbddb\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:15b86\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:83901\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"How to use LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:15b86\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:15b86\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:15b86\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:bbddb\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:15b86\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:83901\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " and", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -31260,7 +31312,7 @@ "data": { "event": { "delta": { - "text": " the", + "text": "{\"", "type": "text" }, "event_type": { @@ -31280,7 +31332,7 @@ "data": { "event": { "delta": { - "text": " file exists", + "text": "type\": \"function\", \"name\": \"knowledge_search\",", "type": "text" }, "event_type": { @@ -31300,7 +31352,7 @@ "data": { "event": { "delta": { - "text": " in the", + "text": " \"parameters\": {\"query\": \"How to use LoRA", "type": "text" }, "event_type": { @@ -31320,7 +31372,7 @@ "data": { "event": { "delta": { - "text": " specified", + "text": " in Torchtune\"}}", "type": "text" }, "event_type": { @@ -31340,8 +31392,19 @@ "data": { "event": { "delta": { - "text": " location", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "How to use LoRA in Torchtune" + }, + "call_id": "548b1430-be4a-4c22-9430-62bda6dd150c", + "tool_name": "knowledge_search" + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -31349,7 +31412,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -31360,33 +31427,58 @@ "data": { "event": { "delta": { - "text": ".\n2", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 117 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 40 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 157 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:bbddb\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:15b86\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:83901\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": ". Use", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -31400,7 +31492,7 @@ "data": { "event": { "delta": { - "text": " the", + "text": "I", "type": "text" }, "event_type": { @@ -31420,7 +31512,7 @@ "data": { "event": { "delta": { - "text": " correct", + "text": "'m ready to help you answer questions about Torchtune based on", "type": "text" }, "event_type": { @@ -31440,7 +31532,7 @@ "data": { "event": { "delta": { - "text": " file path", + "text": " the documentation you provided. What's your first question", "type": "text" }, "event_type": { @@ -31460,7 +31552,7 @@ "data": { "event": { "delta": { - "text": ": If", + "text": "?", "type": "text" }, "event_type": { @@ -31480,33 +31572,58 @@ "data": { "event": { "delta": { - "text": " the", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 75 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 35 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 110 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:da8ed\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:65275\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:f4ddd\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"How to use LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:65275\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:65275\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:65275\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:da8ed\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:65275\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:f4ddd\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " matches the one", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -31860,7 +32002,7 @@ "data": { "event": { "delta": { - "text": " provided in", + "text": "{\"", "type": "text" }, "event_type": { @@ -31880,7 +32022,7 @@ "data": { "event": { "delta": { - "text": " the", + "text": "type\": \"function\", \"name\": \"knowledge", "type": "text" }, "event_type": { @@ -31900,7 +32042,7 @@ "data": { "event": { "delta": { - "text": " code.\n", + "text": "_search\", \"parameters\": {\"query\": \"", "type": "text" }, "event_type": { @@ -31920,7 +32062,7 @@ "data": { "event": { "delta": { - "text": "4.", + "text": "How to use LoRA in Torchtune\"}}", "type": "text" }, "event_type": { @@ -31940,8 +32082,19 @@ "data": { "event": { "delta": { - "text": " Use the", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "How to use LoRA in Torchtune" + }, + "call_id": "b1a5c1c5-905e-4206-95f6-e30f9b07376d", + "tool_name": "knowledge_search" + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -31949,7 +32102,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -31960,33 +32117,58 @@ "data": { "event": { "delta": { - "text": " absolute file", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 117 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 40 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 157 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:da8ed\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:65275\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:f4ddd\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " path:", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -32000,7 +32182,7 @@ "data": { "event": { "delta": { - "text": " Instead of", + "text": "I", "type": "text" }, "event_type": { @@ -32020,7 +32202,7 @@ "data": { "event": { "delta": { - "text": " using a", + "text": "'m ready to help you answer questions about Torcht", "type": "text" }, "event_type": { @@ -32040,7 +32222,7 @@ "data": { "event": { "delta": { - "text": " relative file", + "text": "une based on the documentation you provided. What's your first", "type": "text" }, "event_type": { @@ -32060,7 +32242,7 @@ "data": { "event": { "delta": { - "text": " path,", + "text": " question?", "type": "text" }, "event_type": { @@ -32080,33 +32262,58 @@ "data": { "event": { "delta": { - "text": " try using", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 75 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 35 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 110 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:ea3f6\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:5c435\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:91d52\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"How to use LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:5c435\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:5c435\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:5c435\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:ea3f6\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:5c435\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:91d52\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -32565,13 +32756,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" + "text": "{\"", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -32590,13 +32776,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "import pandas", - "type": "tool_call" + "text": "type\": \"function\", \"name\": \"", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -32615,13 +32796,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " as pd", - "type": "tool_call" + "text": "knowledge_search\", \"parameters\": {\"query", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -32640,13 +32816,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "\n#", - "type": "tool_call" + "text": "\": \"How to use LoRA in Torchtune\"}}", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -32668,9 +32839,15 @@ "parse_status": { "__enum__": "ToolCallParseStatus", "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "How to use LoRA in Torchtune" + }, + "call_id": "3f9aaa8a-ca61-4a51-830a-e9920d3d8ec5", + "tool_name": "knowledge_search" }, - "tool_call": " Load data", "type": "tool_call" }, "event_type": { @@ -32679,7 +32856,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -32690,43 +32871,42 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "\ndf", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:ea3f6\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:5c435\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:91d52\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " = pd", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -32740,13 +32920,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": ".read_csv", - "type": "tool_call" + "text": "I", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -32765,13 +32940,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "(\"/var", - "type": "tool_call" + "text": "'m ready to help you answer questions about Torchtune based on the", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -32790,13 +32960,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "/f", - "type": "tool_call" + "text": " documentation you provided. What's your first question?", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -32815,43 +32980,42 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "olders/r", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:fa9cd\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:6dc04\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:6f75f\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"How to use LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:6dc04\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:6dc04\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:6dc04\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:fa9cd\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:6dc04\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:6f75f\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "70", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -33315,13 +33414,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "y_/", - "type": "tool_call" + "text": "{\"", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -33340,13 +33434,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "4NO", - "type": "tool_call" + "text": "type\": \"function\", \"name\": \"knowledge_search\", \"", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -33365,13 +33454,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "0CF", - "type": "tool_call" + "text": "parameters\": {\"query\": \"How to use LoRA in", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -33390,13 +33474,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "URin", - "type": "tool_call" + "text": " Torchtune\"}}", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -33418,9 +33497,15 @@ "parse_status": { "__enum__": "ToolCallParseStatus", "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "How to use LoRA in Torchtune" + }, + "call_id": "d4e8b8eb-a0be-4434-b270-48315bf20723", + "tool_name": "knowledge_search" }, - "tool_call": "flation.csv", "type": "tool_call" }, "event_type": { @@ -33429,7 +33514,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -33440,43 +33529,58 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "\")\n#", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 117 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 40 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 157 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:fa9cd\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:6dc04\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:6f75f\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " Rows\n", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -33490,13 +33594,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "print(\"", - "type": "tool_call" + "text": "I", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -33515,13 +33614,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "Number of", - "type": "tool_call" + "text": "'m ready to help you answer questions about", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -33540,13 +33634,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " rows and", - "type": "tool_call" + "text": " Torchtune based on the documentation you provided", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -33565,13 +33654,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " columns in", - "type": "tool_call" + "text": ". What's your first question?", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -33590,43 +33674,58 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " the data", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 75 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 35 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 110 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": ":\", df", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -33643,9 +33742,9 @@ "parse_status": { "__enum__": "ToolCallParseStatus", "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" + "value": "started" }, - "tool_call": ".shape)\n", + "tool_call": "", "type": "tool_call" }, "event_type": { @@ -33670,7 +33769,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "# Columns", + "tool_call": "{\"type\": \"function\", \"name\": \"", "type": "tool_call" }, "event_type": { @@ -33695,7 +33794,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "\nprint", + "tool_call": "knowledge_search\", \"parameters\": {\"query\": \"Tor", "type": "tool_call" }, "event_type": { @@ -33720,7 +33819,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "(\"Columns", + "tool_call": "chtune documentation\"}}", "type": "tool_call" }, "event_type": { @@ -33743,9 +33842,15 @@ "parse_status": { "__enum__": "ToolCallParseStatus", "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "Torchtune documentation" + }, + "call_id": "cf722fb9-6067-46ea-8534-852b7d364278", + "tool_name": "knowledge_search" }, - "tool_call": " of the", "type": "tool_call" }, "event_type": { @@ -33754,7 +33859,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -33765,43 +33874,58 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " data are", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 39 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 49 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Llama3-8B attention type\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:num-1\\nContent: 3 `_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\\nof models across a `range of different benchmarks `_.\\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. In this tutorial we will focus on the 8B size model.\\nThere are a few main changes between Llama2-7B and Llama3-8B models:\\n\\n- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B\\n- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:num-1\\nContent: instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_\\n\\n|\\n\\nGetting access to Llama3-8B-Instruct\\n------------------------------------\\n\\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. First, let's download the model from Hugging Face. You will need to follow the instructions\\non the `official Meta page `_ to gain access to the model.\\nNext, make sure you grab your Hugging Face token from `here `_.\\n\\n\\n.. code-block:: bash\\n\\n tune download meta-llama/Meta-Llama-3\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:num-0\\nContent: :`download Llama3 Instruct weights `\\n\\n\\nTemplate changes from Llama2 to Llama3\\n--------------------------------------\\n\\nThe Llama2 chat model requires a specific template when prompting the pre-trained\\nmodel. Since the chat model was pretrained with this prompt template, if you want to run\\ninference on the model, you'll need to use the same template for optimal performance\\non chat data. Otherwise, the model will just perform standard text completion, which\\nmay or may not align with your intended use case.\\n\\nFrom the `official Llama2 prompt\\ntemplate guide `_\\nfor the Llama2 chat model, we can see that special tags are added:\\n\\n.. code-block:: text\\n\\n [INST] <>\\n You are a helpful, respectful, and honest assistant.\\n <>\\n\\n Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant \\n\\nLlama3 Instruct `overhauled `\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:num-0\\nContent: 'm Meta AI, your friendly AI assistant<|eot_id|>\\n\\nThe tags are entirely different, and they are actually encoded differently than in\\nLlama2. Let's walk through tokenizing an example with the Llama2 template and the\\nLlama3 template to understand how.\\n\\n.. note::\\n The Llama3 Base model uses a `different prompt template\\n `_ than Llama3 Instruct\\n because it has not yet been instruct tuned and the extra special tokens are untrained. If you\\n are running inference on the Llama3 Base model without fine-tuning we recommend the base\\n template for optimal performance. Generally, for instruct and chat data, we recommend using\\n Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\\n Llama3 Instruct.\\n\\n.. _prompt_template_vs_special_tokens:\\n\\nTokenizing prompt templates & special tokens\\n--------------------------------------------\\n\\nLet's say I have a sample of a single user-assistant turn accompanied with a system\\nprompt:\\n\\n.. code-block:: python\\n\\n sample = [\\n {\\n \\\"role\\\": \\\"system\\\",\\n \\\"\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:num-3\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\", and \\\"output_proj\\\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\\\"q_proj\\\", \\\"v_proj\\\"])\\n\\n.. note::\\n\\n Calling :func:`lora_llama_2\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Insert documents into memory\", \"parameters\": {}, \"tool_name\": \"insert_into_memory\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": ":\", len", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -33815,13 +33939,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "(df.columns", - "type": "tool_call" + "text": "The", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -33840,13 +33959,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "))\n#", - "type": "tool_call" + "text": " attention type used by Llama3-8B is grouped-query attention", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -33865,13 +33979,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " Column names", - "type": "tool_call" + "text": ".", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -33890,43 +33999,58 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "\nprint", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 80 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 26 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 106 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Llama3-8B attention type\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:num-1\\nContent: 3 `_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\\nof models across a `range of different benchmarks `_.\\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. In this tutorial we will focus on the 8B size model.\\nThere are a few main changes between Llama2-7B and Llama3-8B models:\\n\\n- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B\\n- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:num-1\\nContent: instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_\\n\\n|\\n\\nGetting access to Llama3-8B-Instruct\\n------------------------------------\\n\\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. First, let's download the model from Hugging Face. You will need to follow the instructions\\non the `official Meta page `_ to gain access to the model.\\nNext, make sure you grab your Hugging Face token from `here `_.\\n\\n\\n.. code-block:: bash\\n\\n tune download meta-llama/Meta-Llama-3\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:num-0\\nContent: :`download Llama3 Instruct weights `\\n\\n\\nTemplate changes from Llama2 to Llama3\\n--------------------------------------\\n\\nThe Llama2 chat model requires a specific template when prompting the pre-trained\\nmodel. Since the chat model was pretrained with this prompt template, if you want to run\\ninference on the model, you'll need to use the same template for optimal performance\\non chat data. Otherwise, the model will just perform standard text completion, which\\nmay or may not align with your intended use case.\\n\\nFrom the `official Llama2 prompt\\ntemplate guide `_\\nfor the Llama2 chat model, we can see that special tags are added:\\n\\n.. code-block:: text\\n\\n [INST] <>\\n You are a helpful, respectful, and honest assistant.\\n <>\\n\\n Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant \\n\\nLlama3 Instruct `overhauled `\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:num-0\\nContent: 'm Meta AI, your friendly AI assistant<|eot_id|>\\n\\nThe tags are entirely different, and they are actually encoded differently than in\\nLlama2. Let's walk through tokenizing an example with the Llama2 template and the\\nLlama3 template to understand how.\\n\\n.. note::\\n The Llama3 Base model uses a `different prompt template\\n `_ than Llama3 Instruct\\n because it has not yet been instruct tuned and the extra special tokens are untrained. If you\\n are running inference on the Llama3 Base model without fine-tuning we recommend the base\\n template for optimal performance. Generally, for instruct and chat data, we recommend using\\n Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\\n Llama3 Instruct.\\n\\n.. _prompt_template_vs_special_tokens:\\n\\nTokenizing prompt templates & special tokens\\n--------------------------------------------\\n\\nLet's say I have a sample of a single user-assistant turn accompanied with a system\\nprompt:\\n\\n.. code-block:: python\\n\\n sample = [\\n {\\n \\\"role\\\": \\\"system\\\",\\n \\\"\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:num-3\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\", and \\\"output_proj\\\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\\\"q_proj\\\", \\\"v_proj\\\"])\\n\\n.. note::\\n\\n Calling :func:`lora_llama_2\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "(\"Columns", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -33940,13 +34064,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " of the", - "type": "tool_call" + "text": "The", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -33965,13 +34084,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " data are", - "type": "tool_call" + "text": " attention type used by Llama3-8B is grouped", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -33990,13 +34104,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": ":\", df", - "type": "tool_call" + "text": "-query attention.", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -34015,43 +34124,58 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": ".columns)\n", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 80 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 26 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 106 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Llama3-8B attention type\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:num-1\\nContent: 3 `_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\\nof models across a `range of different benchmarks `_.\\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. In this tutorial we will focus on the 8B size model.\\nThere are a few main changes between Llama2-7B and Llama3-8B models:\\n\\n- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B\\n- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:num-1\\nContent: instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_\\n\\n|\\n\\nGetting access to Llama3-8B-Instruct\\n------------------------------------\\n\\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. First, let's download the model from Hugging Face. You will need to follow the instructions\\non the `official Meta page `_ to gain access to the model.\\nNext, make sure you grab your Hugging Face token from `here `_.\\n\\n\\n.. code-block:: bash\\n\\n tune download meta-llama/Meta-Llama-3\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:num-0\\nContent: :`download Llama3 Instruct weights `\\n\\n\\nTemplate changes from Llama2 to Llama3\\n--------------------------------------\\n\\nThe Llama2 chat model requires a specific template when prompting the pre-trained\\nmodel. Since the chat model was pretrained with this prompt template, if you want to run\\ninference on the model, you'll need to use the same template for optimal performance\\non chat data. Otherwise, the model will just perform standard text completion, which\\nmay or may not align with your intended use case.\\n\\nFrom the `official Llama2 prompt\\ntemplate guide `_\\nfor the Llama2 chat model, we can see that special tags are added:\\n\\n.. code-block:: text\\n\\n [INST] <>\\n You are a helpful, respectful, and honest assistant.\\n <>\\n\\n Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant \\n\\nLlama3 Instruct `overhauled `\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:num-0\\nContent: 'm Meta AI, your friendly AI assistant<|eot_id|>\\n\\nThe tags are entirely different, and they are actually encoded differently than in\\nLlama2. Let's walk through tokenizing an example with the Llama2 template and the\\nLlama3 template to understand how.\\n\\n.. note::\\n The Llama3 Base model uses a `different prompt template\\n `_ than Llama3 Instruct\\n because it has not yet been instruct tuned and the extra special tokens are untrained. If you\\n are running inference on the Llama3 Base model without fine-tuning we recommend the base\\n template for optimal performance. Generally, for instruct and chat data, we recommend using\\n Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\\n Llama3 Instruct.\\n\\n.. _prompt_template_vs_special_tokens:\\n\\nTokenizing prompt templates & special tokens\\n--------------------------------------------\\n\\nLet's say I have a sample of a single user-assistant turn accompanied with a system\\nprompt:\\n\\n.. code-block:: python\\n\\n sample = [\\n {\\n \\\"role\\\": \\\"system\\\",\\n \\\"\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:num-3\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\", and \\\"output_proj\\\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\\\"q_proj\\\", \\\"v_proj\\\"])\\n\\n.. note::\\n\\n Calling :func:`lora_llama_2\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Insert documents into memory\", \"parameters\": {}, \"tool_name\": \"insert_into_memory\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "# Column", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -34065,13 +34189,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " dtypes", - "type": "tool_call" + "text": "The", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -34090,13 +34209,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "\nprint", - "type": "tool_call" + "text": " attention type used by Llama3-8B", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -34115,13 +34229,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "(\"Dat", - "type": "tool_call" + "text": " is grouped-query attention.", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -34140,43 +34249,58 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "atype of", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 80 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 26 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 106 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Llama3-8B attention type\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:num-1\\nContent: 3 `_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\\nof models across a `range of different benchmarks `_.\\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. In this tutorial we will focus on the 8B size model.\\nThere are a few main changes between Llama2-7B and Llama3-8B models:\\n\\n- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B\\n- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:num-1\\nContent: instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_\\n\\n|\\n\\nGetting access to Llama3-8B-Instruct\\n------------------------------------\\n\\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. First, let's download the model from Hugging Face. You will need to follow the instructions\\non the `official Meta page `_ to gain access to the model.\\nNext, make sure you grab your Hugging Face token from `here `_.\\n\\n\\n.. code-block:: bash\\n\\n tune download meta-llama/Meta-Llama-3\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:num-0\\nContent: :`download Llama3 Instruct weights `\\n\\n\\nTemplate changes from Llama2 to Llama3\\n--------------------------------------\\n\\nThe Llama2 chat model requires a specific template when prompting the pre-trained\\nmodel. Since the chat model was pretrained with this prompt template, if you want to run\\ninference on the model, you'll need to use the same template for optimal performance\\non chat data. Otherwise, the model will just perform standard text completion, which\\nmay or may not align with your intended use case.\\n\\nFrom the `official Llama2 prompt\\ntemplate guide `_\\nfor the Llama2 chat model, we can see that special tags are added:\\n\\n.. code-block:: text\\n\\n [INST] <>\\n You are a helpful, respectful, and honest assistant.\\n <>\\n\\n Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant \\n\\nLlama3 Instruct `overhauled `\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:num-0\\nContent: 'm Meta AI, your friendly AI assistant<|eot_id|>\\n\\nThe tags are entirely different, and they are actually encoded differently than in\\nLlama2. Let's walk through tokenizing an example with the Llama2 template and the\\nLlama3 template to understand how.\\n\\n.. note::\\n The Llama3 Base model uses a `different prompt template\\n `_ than Llama3 Instruct\\n because it has not yet been instruct tuned and the extra special tokens are untrained. If you\\n are running inference on the Llama3 Base model without fine-tuning we recommend the base\\n template for optimal performance. Generally, for instruct and chat data, we recommend using\\n Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\\n Llama3 Instruct.\\n\\n.. _prompt_template_vs_special_tokens:\\n\\nTokenizing prompt templates & special tokens\\n--------------------------------------------\\n\\nLet's say I have a sample of a single user-assistant turn accompanied with a system\\nprompt:\\n\\n.. code-block:: python\\n\\n sample = [\\n {\\n \\\"role\\\": \\\"system\\\",\\n \\\"\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:num-3\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\", and \\\"output_proj\\\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\\\"q_proj\\\", \\\"v_proj\\\"])\\n\\n.. note::\\n\\n Calling :func:`lora_llama_2\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " the columns", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -34190,13 +34314,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " are:\",", - "type": "tool_call" + "text": "The", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -34215,13 +34334,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " df.d", - "type": "tool_call" + "text": " attention type used by Llama3-8B", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -34240,13 +34354,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "types)\n", - "type": "tool_call" + "text": " is grouped-query attention.", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -34265,43 +34374,58 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "# Sample", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 80 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 26 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 106 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Insert documents into memory\", \"parameters\": {}, \"tool_name\": \"insert_into_memory\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " of data", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -34315,63 +34439,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "\nprint", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "(\"Data", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " sample from", - "type": "tool_call" + "text": "{\n", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -34390,13 +34459,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " file:\")\n", - "type": "tool_call" + "text": " \"type\": \"function\",\n \"name\": \"knowledge_search\",\n", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -34415,13 +34479,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "print(df", - "type": "tool_call" + "text": " \"parameters\": {\n \"query\": \"L", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -34440,13 +34499,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": ".head())", - "type": "tool_call" + "text": "lama3-8B attention type\"\n }\n}", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -34472,14 +34526,10 @@ }, "tool_call": { "arguments": { - "code": "import pandas as pd\n# Load data\ndf = pd.read_csv(\"/var/folders/rb/qvqyvgyj6yjd3t4pwsy9t0rm0000gn/T/tmpxgxj70y_/4NO0CFURinflation.csv\")\n# Rows\nprint(\"Number of rows and columns in the data:\", df.shape)\n# Columns\nprint(\"Columns of the data are:\", len(df.columns))\n# Column names\nprint(\"Columns of the data are:\", df.columns)\n# Column dtypes\nprint(\"Datatype of the columns are:\", df.dtypes)\n# Sample of data\nprint(\"Data sample from file:\")\nprint(df.head())" + "query": "Llama3-8B attention type" }, - "call_id": "923b4193-8e2e-4b28-b55d-3d0e6e9a3b90", - "tool_name": { - "__enum__": "BuiltinTool", - "__module__": "llama_stack.models.llama.datatypes", - "value": "code_interpreter" - } + "call_id": "9106bccf-d0c5-4b0a-9398-0b5972ada295", + "tool_name": "knowledge_search" }, "type": "tool_call" }, @@ -34523,17 +34573,17 @@ { "metric": "prompt_tokens", "unit": null, - "value": 37 + "value": 40 }, { "metric": "completion_tokens", "unit": null, - "value": 10 + "value": 48 }, { "metric": "total_tokens", "unit": null, - "value": 47 + "value": 88 } ] } @@ -34541,7 +34591,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics about the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"The error message indicates that the file \\\"\" does not exist. This could be due to a variety of reasons such as the file being deleted, the path being incorrect, or the file not being accessible.\\n\\nTo resolve this issue, you can try the following:\\n\\n1. Check the file path: Ensure that the file path is correct and the file exists at that location.\\n2. Check file permissions: Ensure that the file is accessible and you have the necessary permissions to read it.\\n3. Try a different file: If the file is not accessible, try loading a different file to see if the issue is specific to this file or a general issue with your code.\\n4. Check for typos: Ensure that there are no typos in the file path or the code.\\n\\nIf you are still having issues, please provide more details about the file and the code you are using, and I'll be happy to help further.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Convert the 'Year' column to datetime\\ndf['Year'] = pd.to_datetime(df['Year'], format='%Y')\\n\\n# Group by 'Year' and calculate the average inflation\\ndf_avg_inflation = df.groupby('Year')['Inflation'].mean().reset_index()\\n\\n# Plot the average yearly inflation as a time series\\nplt.figure(figsize=(10,6))\\nplt.plot(df_avg_inflation['Year'], df_avg_inflation['Inflation'], marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Inflation')\\nplt.grid(True)\\nplt.show()\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -34569,8 +34619,13 @@ "data": { "event": { "delta": { - "text": "The", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -34589,8 +34644,13 @@ "data": { "event": { "delta": { - "text": " error message indicates that the file \"/var/folders/rb/qv8", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "{\"type\": \"function\", \"name\": \"knowledge_search\",", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -34609,8 +34669,13 @@ "data": { "event": { "delta": { - "text": "vwgyj6yjd3t4pwsy9t0", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " \"parameters\": {\"query\": \"Llama3-8", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -34629,8 +34694,13 @@ "data": { "event": { "delta": { - "text": "rm0000gn/T/tmp2x_sml66/9vY", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "B attention type\"}}", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -34649,8 +34719,19 @@ "data": { "event": { "delta": { - "text": "vmVRoinflation.csv\" does not exist. This could be due to", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "Llama3-8B attention type" + }, + "call_id": "768fe977-8297-42bd-90c3-b1dc07882ce0", + "tool_name": "knowledge_search" + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -34658,7 +34739,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -34669,33 +34754,58 @@ "data": { "event": { "delta": { - "text": " a variety of reasons such as the file being deleted, the path being incorrect", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 40 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 50 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the current CEO of Meta is.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"current CEO of Meta\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"{\\\"query\\\": \\\"current CEO of Meta\\\", \\\"top_k\\\": [{\\\"title\\\": \\\"Executives - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer Joel Kaplan, Chief Global Affairs Officer Susan Li, Chief Financial Officer Javier Olivan, Chief Operating Officer Chris Cox, Chief Product Officer Andrew \\\\u2018Boz\\\\u2019 Bosworth, Chief Technology Officer Jennifer Newstead, Chief Legal Officer Dave Wehner, Chief Strategy Officer Will Cathcart, Head of WhatsApp Naomi Gleit, Head of Product John Hegeman, Chief Revenue Officer Adam Mosseri, Head of Instagram Erin Egan, Chief Privacy Officer, Policy Michel Protti, Chief Privacy Officer, Product Alex Schultz, Chief Marketing Officer and VP of Analytics Tom Alison, Head of Facebook Nicola Mendelsohn, Head of Global Business Group Ahmad Al-Dahle, VP and Head of GenAI at Meta Joelle Pineau, Vice President of AI Research and Head of FAIR at Meta\\\", \\\"score\\\": 0.8190992, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\\\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\\\u00a9 2025 Meta\\\", \\\"score\\\": 0.79099923, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Meet the Executive CSuite Team of Meta (Facebook) [2025]\\\", \\\"url\\\": \\\"https://digitaldefynd.com/IQ/meet-the-executive-csuite-team-of-meta-facebook/\\\", \\\"content\\\": \\\"Harvard University Executive Programs Free Harvard University Courses As a chief financial officer of Meta, Susan Li oversees the firm\\\\u2019s finance and facilities team to keep track of the company\\\\u2019s overall financial health. The chief operating officer of Meta, Javier Olivan, oversees the firm\\\\u2019s business team, infrastructure, and other products. Andrew Bosworth, called Boz, serves as chief technology officer at Meta and is responsible for leading the firm\\\\u2019s AR/VR organization, Reality Labs. Andrew has also served as engineering director to oversee events, mobile monetization, and feed ads and as VP of ads and business platforms to lead engineering, design, analytics, and product teams. Meta\\\\u2019s c-suite team comprises experienced and diverse executives, having extensive experience in technology, finance, legal, and all major industries.\\\", \\\"score\\\": 0.7602419, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg: Founder and CEO of Meta (formerly Facebook) - Investopedia\\\", \\\"url\\\": \\\"https://www.investopedia.com/terms/m/mark-zuckerberg.asp\\\", \\\"content\\\": \\\"Mark Zuckerberg: Founder and CEO of Meta (formerly Facebook) Mark Zuckerberg: Founder and CEO of Meta (formerly Facebook) Mark Zuckerberg is a self-taught computer programmer and co-founder, chair, and chief executive officer of Meta (META), formerly known as Facebook. Mark Zuckerberg is a self-taught computer programmer and the co-founder, chair, and CEO of Meta (formerly Facebook). In April 2018, Zuckerberg testified on Capitol Hill about Facebook's use of users' information, including the sharing of 87 million users' information to Cambridge Analytica. Technically, Mark Zuckerberg makes a salary of $1 a year at Facebook. Booker Join With Facebook Founder and CEO Mark Zuckerberg to Advance a National Model for Improving Public Schools.\\\\\\\"\\\", \\\"score\\\": 0.74697095, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg - Forbes\\\", \\\"url\\\": \\\"https://www.forbes.com/profile/mark-zuckerberg/\\\", \\\"content\\\": \\\"Meta CEO Mark Zuckerberg \\\\u201cloved\\\\u201d an image on Facebook known as \\\\\\\"Challah Horse\\\\\\\" that happens to be AI-generated, highlighting the amount of AI spam on the platform. ### Meta Donates $1 Million To Trump\\\\u2019s Inaugural Fund Weeks After Mark Zuckerberg Met President Elect Meta has donated $1 million to President-elect Donald Trump\\\\u2019s inaugural fund, the company confirmed to various news outlets on Wednesday, a move that comes just weeks after its CEO Mark Zuckerberg met with Trump at his Mar-a-Lago residence in an apparent bid to mend years of strained ties. ### Meta Donates $1 Million To Trump\\\\u2019s Inaugural Fund Weeks After Mark Zuckerberg Met President-Elect Read the full profile on Forbes: https://www.forbes.com/sites/kerryadolan/2023/09/26/mark-gets-meta-zuckerberg-talks-ai-and-that-musk-mma-fight-thats-never-going-to-happen/?sh=671046e73037\\\", \\\"score\\\": 0.6410185, \\\"raw_content\\\": null}]}\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": ", or the file not being accessible.\n\nTo resolve this issue, you can", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -34709,7 +34819,7 @@ "data": { "event": { "delta": { - "text": " try the following:\n\n1. Check the file path: Ensure that the file", + "text": "The", "type": "text" }, "event_type": { @@ -34729,7 +34839,7 @@ "data": { "event": { "delta": { - "text": " path is correct and the file exists at that location.\n2. Check file permissions:", + "text": " current CEO of Meta is Mark Zuckerberg.", "type": "text" }, "event_type": { @@ -34749,33 +34859,94 @@ "data": { "event": { "delta": { - "text": " Ensure that the file is accessible and you have the necessary permissions to", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "LWwngTMJ", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:47:24.889991+00:00", + "__module__": "datetime" + }, + "trace_id": "K0psyd28TdSkb8LK", + "type": "metric", + "unit": "tokens", + "value": 1203 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "LWwngTMJ", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:47:24.890015+00:00", + "__module__": "datetime" + }, + "trace_id": "K0psyd28TdSkb8LK", + "type": "metric", + "unit": "tokens", + "value": 19 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.1-8B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "LWwngTMJ", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-06T04:47:24.890017+00:00", + "__module__": "datetime" + }, + "trace_id": "K0psyd28TdSkb8LK", + "type": "metric", + "unit": "tokens", + "value": 1222 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the current CEO of Meta is.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"current CEO of Meta\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"{\\\"query\\\": \\\"current CEO of Meta\\\", \\\"top_k\\\": [{\\\"title\\\": \\\"Meet the Executive CSuite Team of Meta (Facebook) [2025]\\\", \\\"url\\\": \\\"https://digitaldefynd.com/IQ/meet-the-executive-csuite-team-of-meta-facebook/\\\", \\\"content\\\": \\\"Harvard University Executive Programs Free Harvard University Courses As a chief financial officer of Meta, Susan Li oversees the firm\\\\u2019s finance and facilities team to keep track of the company\\\\u2019s overall financial health. The chief operating officer of Meta, Javier Olivan, oversees the firm\\\\u2019s business team, infrastructure, and other products. Andrew Bosworth, called Boz, serves as chief technology officer at Meta and is responsible for leading the firm\\\\u2019s AR/VR organization, Reality Labs. Andrew has also served as engineering director to oversee events, mobile monetization, and feed ads and as VP of ads and business platforms to lead engineering, design, analytics, and product teams. Meta\\\\u2019s c-suite team comprises experienced and diverse executives, having extensive experience in technology, finance, legal, and all major industries.\\\", \\\"score\\\": 0.7602419, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg - Forbes\\\", \\\"url\\\": \\\"https://www.forbes.com/profile/mark-zuckerberg/\\\", \\\"content\\\": \\\"Meta has donated $1 million to President-elect Donald Trump's inaugural fund, the company confirmed to various news outlets on Wednesday, a move that comes just weeks after its CEO Mark\\\", \\\"score\\\": 0.6701125, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Meta - Leadership & Governance\\\", \\\"url\\\": \\\"https://investor.atmeta.com/leadership-and-governance/\\\", \\\"content\\\": \\\"Mr. Andreessen was a co-founder of Netscape Communications Corporation, a software company, serving in various positions, including Chief Technology Officer and Executive Vice President of Products. Ms. Killefer also served as Assistant Secretary for Management, Chief Financial Officer, and Chief Operating Officer of the U.S. Department of the Treasury from 1997 to 2000 and as a member of the IRS Oversight Board from 2000 to 2005, including as Chair of the IRS Oversight Board from 2002 to 2004. Ms. Travis has served as Executive Vice President and Chief Financial Officer of The Estee Lauder Companies Inc., a global manufacturer and marketer of skin care, makeup, fragrance and hair care products, since August 2012.\\\", \\\"score\\\": 0.6175132, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"META | Meta Platforms Inc. Company Profile & Executives - WSJ\\\", \\\"url\\\": \\\"https://www.wsj.com/market-data/quotes/META/company-people\\\", \\\"content\\\": \\\"Company profile for Meta Platforms Inc. including key executives, insider trading, ownership, revenue and average growth rates. View detailed META description & address.\\\", \\\"score\\\": 0.23361932, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg - Wikipedia\\\", \\\"url\\\": \\\"https://en.wikipedia.org/wiki/Mark_Zuckerberg\\\", \\\"content\\\": \\\"They began dating in 2003.[175] In September 2010, Chan, who was a medical student at the University of California, San Francisco at the time,[176] moved into his rented house in Palo Alto, California.[177][178] They married on May 19, 2012, in the grounds of his mansion in an event that also celebrated her graduation from medical school.[179][180] Zuckerberg revealed in July 2015 that they were expecting a baby girl and that Chan had previously experienced three miscarriages.[181] Their first daughter was born in December 2015.[182] They announced in a Chinese New Year video that their daughter's Chinese name is Chen Mingyu (Chinese: \\\\u9648\\\\u660e\\\\u5b87).[183] Their second daughter was born in August 2017.[184] Zuckerberg and his wife welcomed their third daughter in March 2023 and announced the news across his social media pages.[185] The couple also have a Puli dog named Beast,[186] who has over two million followers on Facebook.[187] Zuckerberg commissioned the visual artist Daniel Arsham to build a 7-foot-tall sculpture of his wife, which was unveiled in 2024.[188]\\\", \\\"score\\\": 0.05564338, \\\"raw_content\\\": null}]}\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " read it.\n3. Try a different file: If the file is not", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -34789,7 +34960,7 @@ "data": { "event": { "delta": { - "text": " accessible, try loading a different file to see if the issue is specific to", + "text": "The", "type": "text" }, "event_type": { @@ -34809,7 +34980,7 @@ "data": { "event": { "delta": { - "text": " this file or a general issue with your code.\n4. Check for ty", + "text": " current CEO of Meta is not explicitly stated in", "type": "text" }, "event_type": { @@ -34829,7 +35000,7 @@ "data": { "event": { "delta": { - "text": "pos: Ensure that there are no typos in the file path or the", + "text": " the search results. However, Mark Zuckerberg is mentioned as the CEO", "type": "text" }, "event_type": { @@ -34849,7 +35020,7 @@ "data": { "event": { "delta": { - "text": " code.\n\nIf you are still having issues, please provide more details about the file and the code", + "text": " of Meta in some of the search results, but it is not clear", "type": "text" }, "event_type": { @@ -34869,7 +35040,7 @@ "data": { "event": { "delta": { - "text": " you are using, and I'll be happy to help further.", + "text": " if he is still the current CEO.", "type": "text" }, "event_type": { @@ -34904,65 +35075,13 @@ "value": "end_of_turn" } }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "f28sT2i7", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:23.262530+00:00", - "__module__": "datetime" - }, - "trace_id": "8YKzpfybSiGgrHOF", - "type": "metric", - "unit": "tokens", - "value": 680 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "f28sT2i7", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:23.262555+00:00", - "__module__": "datetime" - }, - "trace_id": "8YKzpfybSiGgrHOF", - "type": "metric", - "unit": "tokens", - "value": 238 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "f28sT2i7", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:23.262558+00:00", - "__module__": "datetime" - }, - "trace_id": "8YKzpfybSiGgrHOF", - "type": "metric", - "unit": "tokens", - "value": 918 - } - ] + "metrics": null } } ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics about the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"The error message indicates that the file \\\"\" does not exist. This could be due to a variety of reasons such as the file being deleted, the path being incorrect, or the file not being accessible.\\n\\nTo resolve this issue, you can try the following:\\n\\n1. Check the file path: Ensure that the file path is correct and the file exists at that location.\\n2. Check file permissions: Ensure that the file is accessible and you have the necessary permissions to read it.\\n3. Try a different file: If the file is not accessible, try loading a different file to see if the issue is specific to this file or a general issue with your code.\\n4. Check for typos: Ensure that there are no typos in the file path or the code.\\n\\nIf you are still having issues, please provide more details about the file and the code you are using, and I'll be happy to help further.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the current CEO of Meta is.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"current CEO of Meta\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"{\\\"query\\\": \\\"current CEO of Meta\\\", \\\"top_k\\\": [{\\\"title\\\": \\\"Meta - Leadership & Governance\\\", \\\"url\\\": \\\"https://investor.atmeta.com/leadership-and-governance/\\\", \\\"content\\\": \\\"Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. Mark is responsible for setting the overall direction and product strategy for the company. He leads the design of Meta's services and development of its core technology and infrastructure. Mark studied computer science at Harvard\\\", \\\"score\\\": 0.8342047, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\\\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\\\u00a9 2025 Meta\\\", \\\"score\\\": 0.79099923, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"The 11 People Running Meta's $1 Trillion Social Media and ... - Observer\\\", \\\"url\\\": \\\"https://observer.com/2024/01/meta-facebook-top-executives/\\\", \\\"content\\\": \\\"Meta has one of the most stable leadership team in the tech industry. Almost all of Meta's top executives have been with the company for well over a decade. ... 39, cofounder, chairman and CEO\\\", \\\"score\\\": 0.45536873, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Executives - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/\\\", \\\"content\\\": \\\"Meta leadership: images of senior executives for download to use in articles about the company.\\\", \\\"score\\\": 0.21026355, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg - Wikipedia\\\", \\\"url\\\": \\\"https://en.wikipedia.org/wiki/Mark_Zuckerberg\\\", \\\"content\\\": \\\"They began dating in 2003.[175] In September 2010, Chan, who was a medical student at the University of California, San Francisco at the time,[176] moved into his rented house in Palo Alto, California.[177][178] They married on May 19, 2012, in the grounds of his mansion in an event that also celebrated her graduation from medical school.[179][180] Zuckerberg revealed in July 2015 that they were expecting a baby girl and that Chan had previously experienced three miscarriages.[181] Their first daughter was born in December 2015.[182] They announced in a Chinese New Year video that their daughter's Chinese name is Chen Mingyu (Chinese: \\\\u9648\\\\u660e\\\\u5b87).[183] Their second daughter was born in August 2017.[184] Zuckerberg and his wife welcomed their third daughter in March 2023 and announced the news across his social media pages.[185] The couple also have a Puli dog named Beast,[186] who has over two million followers on Facebook.[187] Zuckerberg commissioned the visual artist Daniel Arsham to build a 7-foot-tall sculpture of his wife, which was unveiled in 2024.[188]\\\", \\\"score\\\": 0.05564338, \\\"raw_content\\\": null}]}\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -34990,13 +35109,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" + "text": "The", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -35015,13 +35129,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load the CSV", - "type": "tool_call" + "text": " current CEO of Meta is Mark Zuckerberg.", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -35040,18 +35149,42 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " file\ndf = pd.read_csv(\"/var/folders/rb/qv", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the current CEO of Meta is.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" }, "logprobs": null, "stop_reason": null @@ -35068,9 +35201,9 @@ "parse_status": { "__enum__": "ToolCallParseStatus", "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" + "value": "started" }, - "tool_call": "8vwgyj6yjd3t4pwsy9t", + "tool_call": "", "type": "tool_call" }, "event_type": { @@ -35095,7 +35228,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "0rm0000gn/T/tmp2x_sml66/9v", + "tool_call": "brave_search.call(query=\"current CEO of Meta\")", "type": "tool_call" }, "event_type": { @@ -35118,9 +35251,19 @@ "parse_status": { "__enum__": "ToolCallParseStatus", "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "current CEO of Meta" + }, + "call_id": "cc85a2df-6b2d-41c0-97dd-1509ca8061c4", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "brave_search" + } }, - "tool_call": "YvmVRoinflation.csv\")\n\n# Convert the 'Year'", "type": "tool_call" }, "event_type": { @@ -35129,7 +35272,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -35140,18 +35287,42 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " column to datetime\ndf['Year'] = pd.to_datetime(df['Year", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the founder of Meta is.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Meta founder\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"{\\\"query\\\": \\\"Meta founder\\\", \\\"top_k\\\": [{\\\"title\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\\\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\\\u00a9 2025 Meta\\\", \\\"score\\\": 0.81595254, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Executives - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer Joel Kaplan, Chief Global Affairs Officer Susan Li, Chief Financial Officer Javier Olivan, Chief Operating Officer Chris Cox, Chief Product Officer Andrew \\\\u2018Boz\\\\u2019 Bosworth, Chief Technology Officer Jennifer Newstead, Chief Legal Officer Dave Wehner, Chief Strategy Officer Will Cathcart, Head of WhatsApp Naomi Gleit, Head of Product John Hegeman, Chief Revenue Officer Adam Mosseri, Head of Instagram Erin Egan, Chief Privacy Officer, Policy Michel Protti, Chief Privacy Officer, Product Alex Schultz, Chief Marketing Officer and VP of Analytics Tom Alison, Head of Facebook Nicola Mendelsohn, Head of Global Business Group Ahmad Al-Dahle, VP and Head of GenAI at Meta Joelle Pineau, Vice President of AI Research and Head of FAIR at Meta\\\", \\\"score\\\": 0.70726365, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Meta - Leadership & Governance\\\", \\\"url\\\": \\\"https://investor.atmeta.com/leadership-and-governance/\\\", \\\"content\\\": \\\"Mr. Andreessen was a co-founder of Netscape Communications Corporation, a software company, serving in various positions, including Chief Technology Officer and Executive Vice President of Products. Ms. Killefer also served as Assistant Secretary for Management, Chief Financial Officer, and Chief Operating Officer of the U.S. Department of the Treasury from 1997 to 2000 and as a member of the IRS Oversight Board from 2000 to 2005, including as Chair of the IRS Oversight Board from 2002 to 2004. Ms. Travis has served as Executive Vice President and Chief Financial Officer of The Estee Lauder Companies Inc., a global manufacturer and marketer of skin care, makeup, fragrance and hair care products, since August 2012.\\\", \\\"score\\\": 0.467308, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Meta Platforms - Wikipedia\\\", \\\"url\\\": \\\"https://en.wikipedia.org/wiki/Meta_Platforms\\\", \\\"content\\\": \\\"Following a period of intense scrutiny and damaging whistleblower leaks, news started to emerge on October 21, 2021, about Facebook's plan to rebrand the company and change its name.[15][54] In the Q3 2021 Earnings Call on October 25, Mark Zuckerberg discussed the ongoing criticism of the company's social services and the way it operates, and pointed to the pivoting efforts to building the metaverse \\\\u2013 without mentioning the rebranding and the name change.[55] The metaverse vision and the name change from Facebook, Inc. to Meta Platforms was introduced at Facebook Connect on October 28, 2021.[16] Based on Facebook's PR campaign, the name change reflects the company's shifting long term focus of building the metaverse, a digital extension of the physical world by social media, virtual reality and augmented reality features.[16][56]\\\", \\\"score\\\": 0.14999175, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg - Wikipedia\\\", \\\"url\\\": \\\"https://en.wikipedia.org/wiki/Mark_Zuckerberg\\\", \\\"content\\\": \\\"They began dating in 2003.[175] In September 2010, Chan, who was a medical student at the University of California, San Francisco at the time,[176] moved into his rented house in Palo Alto, California.[177][178] They married on May 19, 2012, in the grounds of his mansion in an event that also celebrated her graduation from medical school.[179][180] Zuckerberg revealed in July 2015 that they were expecting a baby girl and that Chan had previously experienced three miscarriages.[181] Their first daughter was born in December 2015.[182] They announced in a Chinese New Year video that their daughter's Chinese name is Chen Mingyu (Chinese: \\\\u9648\\\\u660e\\\\u5b87).[183] Their second daughter was born in August 2017.[184] Zuckerberg and his wife welcomed their third daughter in March 2023 and announced the news across his social media pages.[185] The couple also have a Puli dog named Beast,[186] who has over two million followers on Facebook.[187] Zuckerberg commissioned the visual artist Daniel Arsham to build a 7-foot-tall sculpture of his wife, which was unveiled in 2024.[188]\\\", \\\"score\\\": 0.03678684, \\\"raw_content\\\": null}]}\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" }, "logprobs": null, "stop_reason": null @@ -35165,13 +35336,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "'], format='%Y')\n\n# Group by", - "type": "tool_call" + "text": "The", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -35190,13 +35356,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " 'Year' and calculate the average inflation\ndf_avg_in", - "type": "tool_call" + "text": " founder of Meta is Mark Zuckerberg.", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -35215,18 +35376,58 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "flation = df.groupby('Year')['Inflation'].mean().reset_index()\n\n", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 1220 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 18 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 1238 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the founder of Meta is.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Meta founder\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"{\\\"query\\\": \\\"Meta founder\\\", \\\"top_k\\\": [{\\\"title\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\\\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\\\u00a9 2025 Meta\\\", \\\"score\\\": 0.81595254, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Executives - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer Joel Kaplan, Chief Global Affairs Officer Susan Li, Chief Financial Officer Javier Olivan, Chief Operating Officer Chris Cox, Chief Product Officer Andrew \\\\u2018Boz\\\\u2019 Bosworth, Chief Technology Officer Jennifer Newstead, Chief Legal Officer Dave Wehner, Chief Strategy Officer Will Cathcart, Head of WhatsApp Naomi Gleit, Head of Product John Hegeman, Chief Revenue Officer Adam Mosseri, Head of Instagram Erin Egan, Chief Privacy Officer, Policy Michel Protti, Chief Privacy Officer, Product Alex Schultz, Chief Marketing Officer and VP of Analytics Tom Alison, Head of Facebook Nicola Mendelsohn, Head of Global Business Group Ahmad Al-Dahle, VP and Head of GenAI at Meta Joelle Pineau, Vice President of AI Research and Head of FAIR at Meta\\\", \\\"score\\\": 0.70726365, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Meta - Leadership & Governance\\\", \\\"url\\\": \\\"https://investor.atmeta.com/leadership-and-governance/\\\", \\\"content\\\": \\\"Mr. Andreessen was a co-founder of Netscape Communications Corporation, a software company, serving in various positions, including Chief Technology Officer and Executive Vice President of Products. Ms. Killefer also served as Assistant Secretary for Management, Chief Financial Officer, and Chief Operating Officer of the U.S. Department of the Treasury from 1997 to 2000 and as a member of the IRS Oversight Board from 2000 to 2005, including as Chair of the IRS Oversight Board from 2002 to 2004. Ms. Travis has served as Executive Vice President and Chief Financial Officer of The Estee Lauder Companies Inc., a global manufacturer and marketer of skin care, makeup, fragrance and hair care products, since August 2012.\\\", \\\"score\\\": 0.467308, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Meta Platforms - Wikipedia\\\", \\\"url\\\": \\\"https://en.wikipedia.org/wiki/Meta_Platforms\\\", \\\"content\\\": \\\"Following a period of intense scrutiny and damaging whistleblower leaks, news started to emerge on October 21, 2021, about Facebook's plan to rebrand the company and change its name.[15][54] In the Q3 2021 Earnings Call on October 25, Mark Zuckerberg discussed the ongoing criticism of the company's social services and the way it operates, and pointed to the pivoting efforts to building the metaverse \\\\u2013 without mentioning the rebranding and the name change.[55] The metaverse vision and the name change from Facebook, Inc. to Meta Platforms was introduced at Facebook Connect on October 28, 2021.[16] Based on Facebook's PR campaign, the name change reflects the company's shifting long term focus of building the metaverse, a digital extension of the physical world by social media, virtual reality and augmented reality features.[16][56]\\\", \\\"score\\\": 0.14999175, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg - Wikipedia\\\", \\\"url\\\": \\\"https://en.wikipedia.org/wiki/Mark_Zuckerberg\\\", \\\"content\\\": \\\"They began dating in 2003.[175] In September 2010, Chan, who was a medical student at the University of California, San Francisco at the time,[176] moved into his rented house in Palo Alto, California.[177][178] They married on May 19, 2012, in the grounds of his mansion in an event that also celebrated her graduation from medical school.[179][180] Zuckerberg revealed in July 2015 that they were expecting a baby girl and that Chan had previously experienced three miscarriages.[181] Their first daughter was born in December 2015.[182] They announced in a Chinese New Year video that their daughter's Chinese name is Chen Mingyu (Chinese: \\\\u9648\\\\u660e\\\\u5b87).[183] Their second daughter was born in August 2017.[184] Zuckerberg and his wife welcomed their third daughter in March 2023 and announced the news across his social media pages.[185] The couple also have a Puli dog named Beast,[186] who has over two million followers on Facebook.[187] Zuckerberg commissioned the visual artist Daniel Arsham to build a 7-foot-tall sculpture of his wife, which was unveiled in 2024.[188]\\\", \\\"score\\\": 0.03678684, \\\"raw_content\\\": null}]}\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" }, "logprobs": null, "stop_reason": null @@ -35240,13 +35441,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "# Plot the average yearly inflation as a time series\n", - "type": "tool_call" + "text": "The", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -35265,13 +35461,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "plt.figure(figsize=(10,6))\nplt.plot(df_avg_inflation['", - "type": "tool_call" + "text": " founder of Meta is Mark Zuckerberg.", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -35290,18 +35481,58 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "Year'], df_avg_in", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 1220 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 18 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 1238 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the founder of Meta is.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" }, "logprobs": null, "stop_reason": null @@ -35318,9 +35549,9 @@ "parse_status": { "__enum__": "ToolCallParseStatus", "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" + "value": "started" }, - "tool_call": "flation['Inflation'], marker='o')\nplt", + "tool_call": "", "type": "tool_call" }, "event_type": { @@ -35345,7 +35576,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": ".title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel", + "tool_call": "brave_search.call(query", "type": "tool_call" }, "event_type": { @@ -35370,7 +35601,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "('Inflation')\nplt.grid(True)\nplt.show()", + "tool_call": "=\"Meta founder\")", "type": "tool_call" }, "event_type": { @@ -35397,13 +35628,13 @@ }, "tool_call": { "arguments": { - "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4pwsy9t0rm0000gn/T/tmp2x_sml66/9vYvmVRoinflation.csv\")\n\n# Convert the 'Year' column to datetime\ndf['Year'] = pd.to_datetime(df['Year'], format='%Y')\n\n# Group by 'Year' and calculate the average inflation\ndf_avg_inflation = df.groupby('Year')['Inflation'].mean().reset_index()\n\n# Plot the average yearly inflation as a time series\nplt.figure(figsize=(10,6))\nplt.plot(df_avg_inflation['Year'], df_avg_inflation['Inflation'], marker='o')\nplt.title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel('Inflation')\nplt.grid(True)\nplt.show()" + "query": "Meta founder" }, - "call_id": "f4efa2d4-e4e7-4ea1-8c5e-6a78bec5816f", + "call_id": "b81c41ae-5eb7-41b7-b466-78eb25a91bb7", "tool_name": { "__enum__": "BuiltinTool", "__module__": "llama_stack.models.llama.datatypes", - "value": "code_interpreter" + "value": "brave_search" } }, "type": "tool_call" @@ -35446,55 +35677,19 @@ }, "metrics": [ { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "prompt_tokens", - "span_id": "qQY5sAli", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:21.953806+00:00", - "__module__": "datetime" - }, - "trace_id": "8YKzpfybSiGgrHOF", - "type": "metric", - "unit": "tokens", - "value": 432 + "unit": null, + "value": 33 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "completion_tokens", - "span_id": "qQY5sAli", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:21.953843+00:00", - "__module__": "datetime" - }, - "trace_id": "8YKzpfybSiGgrHOF", - "type": "metric", - "unit": "tokens", + "unit": null, "value": 10 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "total_tokens", - "span_id": "qQY5sAli", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:21.953847+00:00", - "__module__": "datetime" - }, - "trace_id": "8YKzpfybSiGgrHOF", - "type": "metric", - "unit": "tokens", - "value": 442 + "unit": null, + "value": 43 } ] } @@ -35502,7 +35697,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print the shape of the dataframe (number of rows and columns)\\nprint(df.shape)\\n\\n# Print the column names\\nprint(df.columns)\\n\\n# Print the data types of each column\\nprint(df.dtypes)\\n\\n# Print a summary of the dataframe (count, mean, std, min, 25%, 50%, 75%, max)\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"The error message indicates that the file \\\"\" does not exist. This could be due to a number of reasons, such as the file being deleted or moved, or the path being incorrect.\\n\\nTo resolve this issue, you can try the following:\\n\\n1. Check the file path: Make sure the file path is correct and the file exists at that location.\\n2. Check the file name: Make sure the file name is correct and it matches the one you are trying to load.\\n3. Check the file format: Make sure the file is in the correct format (CSV) and it is not corrupted.\\n4. Try a different file: If the file is not available, try loading a different file to see if the issue is specific to this file or not.\\n\\nIf you are still having trouble, please provide more information about the file and the error message you are receiving, and I will do my best to assist you.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Convert the 'Year' column to datetime\\ndf['Year'] = pd.to_datetime(df['Year'], format='%Y')\\n\\n# Group by 'Year' and calculate the average inflation\\ndf_avg_inflation = df.groupby('Year')['Inflation'].mean().reset_index()\\n\\n# Plot the average inflation as a time series\\nplt.figure(figsize=(10,6))\\nplt.plot(df_avg_inflation['Year'], df_avg_inflation['Inflation'], marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Inflation')\\nplt.grid(True)\\nplt.show()\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": \"get_boiling_point\", \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -35550,7 +35745,7 @@ "data": { "event": { "delta": { - "text": " error message", + "text": " function `get_boiling_point` is", "type": "text" }, "event_type": { @@ -35570,7 +35765,7 @@ "data": { "event": { "delta": { - "text": " indicates that", + "text": " not able to find the boiling point of poly", "type": "text" }, "event_type": { @@ -35590,7 +35785,7 @@ "data": { "event": { "delta": { - "text": " the file", + "text": "juice as it is a fictional liquid from the Harry Potter series", "type": "text" }, "event_type": { @@ -35610,7 +35805,7 @@ "data": { "event": { "delta": { - "text": " \"/var", + "text": ". The function is only able to find the boiling point of real", "type": "text" }, "event_type": { @@ -35630,7 +35825,7 @@ "data": { "event": { "delta": { - "text": "/folders", + "text": " liquids.", "type": "text" }, "event_type": { @@ -35650,33 +35845,58 @@ "data": { "event": { "delta": { - "text": "/rb", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 70 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 56 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 126 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "/qv", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -35690,7 +35910,7 @@ "data": { "event": { "delta": { - "text": "8vw", + "text": "The", "type": "text" }, "event_type": { @@ -35710,7 +35930,7 @@ "data": { "event": { "delta": { - "text": "gyj", + "text": " function `get_boiling_point", "type": "text" }, "event_type": { @@ -35730,7 +35950,7 @@ "data": { "event": { "delta": { - "text": "6y", + "text": "` is not able to find the boiling point of polyjuice", "type": "text" }, "event_type": { @@ -35750,7 +35970,7 @@ "data": { "event": { "delta": { - "text": "jd3", + "text": " as it is not a real liquid.", "type": "text" }, "event_type": { @@ -35770,33 +35990,58 @@ "data": { "event": { "delta": { - "text": "t4", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 70 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 38 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 108 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"required\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "p", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -35810,7 +36055,7 @@ "data": { "event": { "delta": { - "text": "wsy", + "text": "The", "type": "text" }, "event_type": { @@ -35830,7 +36075,7 @@ "data": { "event": { "delta": { - "text": "9t", + "text": " function `get_boiling_point` is not able to", "type": "text" }, "event_type": { @@ -35850,7 +36095,7 @@ "data": { "event": { "delta": { - "text": "0rm", + "text": " find the boiling point of polyju", "type": "text" }, "event_type": { @@ -35870,7 +36115,7 @@ "data": { "event": { "delta": { - "text": "0000", + "text": "ice as it is not a real liquid.", "type": "text" }, "event_type": { @@ -35890,33 +36135,58 @@ "data": { "event": { "delta": { - "text": "gn/T", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 70 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 38 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 108 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": \"get_boiling_point\", \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "/tmpx", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -35930,7 +36200,7 @@ "data": { "event": { "delta": { - "text": "gxj", + "text": "The", "type": "text" }, "event_type": { @@ -35950,7 +36220,7 @@ "data": { "event": { "delta": { - "text": "70y", + "text": " function `get_boiling_point` is not able to", "type": "text" }, "event_type": { @@ -35970,7 +36240,7 @@ "data": { "event": { "delta": { - "text": "_/Z", + "text": " find the boiling point of polyjuice as", "type": "text" }, "event_type": { @@ -35990,7 +36260,7 @@ "data": { "event": { "delta": { - "text": "qZ", + "text": " it is a fictional liquid from the Harry Potter series. The", "type": "text" }, "event_type": { @@ -36010,7 +36280,7 @@ "data": { "event": { "delta": { - "text": "39W", + "text": " function is only able to find the boiling point of real liquids", "type": "text" }, "event_type": { @@ -36030,7 +36300,7 @@ "data": { "event": { "delta": { - "text": "iyin", + "text": ".", "type": "text" }, "event_type": { @@ -36050,33 +36320,58 @@ "data": { "event": { "delta": { - "text": "flation.csv", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 70 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 56 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 126 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "\" does", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -36090,7 +36385,7 @@ "data": { "event": { "delta": { - "text": " not exist", + "text": "The", "type": "text" }, "event_type": { @@ -36110,7 +36405,7 @@ "data": { "event": { "delta": { - "text": ". This", + "text": " function `get_boiling_point` is not able to find the boiling point", "type": "text" }, "event_type": { @@ -36130,7 +36425,7 @@ "data": { "event": { "delta": { - "text": " could be", + "text": " of polyjuice as it is not a real liquid.", "type": "text" }, "event_type": { @@ -36150,33 +36445,58 @@ "data": { "event": { "delta": { - "text": " due to", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 70 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 38 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 108 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"required\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " a", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -36190,7 +36510,7 @@ "data": { "event": { "delta": { - "text": " number of", + "text": "The", "type": "text" }, "event_type": { @@ -36210,7 +36530,7 @@ "data": { "event": { "delta": { - "text": " reasons,", + "text": " function `get_boiling_point` is not able to find the", "type": "text" }, "event_type": { @@ -36230,7 +36550,7 @@ "data": { "event": { "delta": { - "text": " such as", + "text": " boiling point of polyjuice as", "type": "text" }, "event_type": { @@ -36250,7 +36570,7 @@ "data": { "event": { "delta": { - "text": " the file", + "text": " it is not a real liquid.", "type": "text" }, "event_type": { @@ -36270,33 +36590,58 @@ "data": { "event": { "delta": { - "text": " being deleted", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 70 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 38 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 108 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": \"get_boiling_point\", \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " or moved", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -36310,8 +36655,13 @@ "data": { "event": { "delta": { - "text": ", or", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -36330,8 +36680,13 @@ "data": { "event": { "delta": { - "text": " the", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "{\"type\": \"function\", \"name\": \"get_boiling_point\",", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -36350,8 +36705,13 @@ "data": { "event": { "delta": { - "text": " path being", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " \"parameters\": {\"liquid_name\": \"polyjuice\"}}", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -36370,8 +36730,19 @@ "data": { "event": { "delta": { - "text": " incorrect.\n\n", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "liquid_name": "polyjuice" + }, + "call_id": "b63f9b8c-c514-48bb-8e0f-788b29c1c106", + "tool_name": "get_boiling_point" + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -36379,7 +36750,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -36390,33 +36765,58 @@ "data": { "event": { "delta": { - "text": "To resolve", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 30 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 40 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " this issue", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -36430,8 +36830,13 @@ "data": { "event": { "delta": { - "text": ", you", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -36450,8 +36855,13 @@ "data": { "event": { "delta": { - "text": " can try", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "{\"type\": \"function\", \"name\": \"get_boiling", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -36470,8 +36880,13 @@ "data": { "event": { "delta": { - "text": " the following", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "_point\", \"parameters\": {\"liquid_name\": \"polyjuice", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -36490,8 +36905,13 @@ "data": { "event": { "delta": { - "text": ":\n\n1", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\"}}", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -36510,8 +36930,19 @@ "data": { "event": { "delta": { - "text": ". Check", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "liquid_name": "polyjuice" + }, + "call_id": "ec121f44-66e0-47e8-971a-211142998c65", + "tool_name": "get_boiling_point" + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -36519,7 +36950,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -36530,33 +36965,58 @@ "data": { "event": { "delta": { - "text": " the file", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 30 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 40 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"none\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " path:", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -36570,7 +37030,7 @@ "data": { "event": { "delta": { - "text": " Make sure", + "text": "I", "type": "text" }, "event_type": { @@ -36590,7 +37050,7 @@ "data": { "event": { "delta": { - "text": " the file", + "text": " couldn't find any information on the boiling point of Polyjuice", "type": "text" }, "event_type": { @@ -36610,7 +37070,7 @@ "data": { "event": { "delta": { - "text": " path is", + "text": ". Polyjuice is a magical potion in the Harry Potter series", "type": "text" }, "event_type": { @@ -36630,7 +37090,7 @@ "data": { "event": { "delta": { - "text": " correct and", + "text": " that allows the drinker to transform into someone else.", "type": "text" }, "event_type": { @@ -36650,7 +37110,7 @@ "data": { "event": { "delta": { - "text": " the file", + "text": " It's not a physical substance with a boiling point. If", "type": "text" }, "event_type": { @@ -36670,7 +37130,7 @@ "data": { "event": { "delta": { - "text": " exists at", + "text": " you have any other questions, I'd", "type": "text" }, "event_type": { @@ -36690,7 +37150,7 @@ "data": { "event": { "delta": { - "text": " that location", + "text": " be happy to help.", "type": "text" }, "event_type": { @@ -36710,33 +37170,58 @@ "data": { "event": { "delta": { - "text": ".\n", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 30 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 73 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 103 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"required\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "2", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -36750,8 +37235,13 @@ "data": { "event": { "delta": { - "text": ". Check", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -36770,8 +37260,13 @@ "data": { "event": { "delta": { - "text": " the file", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "{\"type\": \"function\", \"name\": \"get_boiling", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -36790,8 +37285,13 @@ "data": { "event": { "delta": { - "text": " name:", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "_point\", \"parameters\": {\"liquid_name\": \"polyjuice", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -36810,8 +37310,13 @@ "data": { "event": { "delta": { - "text": " Make sure", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\"}}", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -36830,8 +37335,19 @@ "data": { "event": { "delta": { - "text": " the file", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "liquid_name": "polyjuice" + }, + "call_id": "1ca40c99-853b-44e3-ab2c-f194e3ed1b45", + "tool_name": "get_boiling_point" + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -36839,7 +37355,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -36850,33 +37370,58 @@ "data": { "event": { "delta": { - "text": " name is", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 30 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 40 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Write code and execute it to find the answer for: What is the 100th prime number?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"def is_prime(n):\\n if n <= 1:\\n return False\\n if n <= 3:\\n return True\\n if n % 2 == 0 or n % 3 == 0:\\n return False\\n i = 5\\n while i * i <= n:\\n if n % i == 0 or n % (i + 2) == 0:\\n return False\\n i += 6\\n return True\\n\\ndef get_nth_prime(n):\\n count = 0\\n num = 2\\n while True:\\n if is_prime(num):\\n count += 1\\n if count == n:\\n return num\\n num += 1\\n\\nprint(get_nth_prime(100))\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stderr]\\nTraceback (most recent call last):\\n line 5, in \\n from bwrap.core import main\\nModuleNotFoundError: No module named 'bwrap.core'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " correct and", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -36890,7 +37435,7 @@ "data": { "event": { "delta": { - "text": " it matches", + "text": "The", "type": "text" }, "event_type": { @@ -36910,7 +37455,7 @@ "data": { "event": { "delta": { - "text": " the", + "text": " 100th prime number is 541.", "type": "text" }, "event_type": { @@ -36930,33 +37475,42 @@ "data": { "event": { "delta": { - "text": " one you", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Write code and execute it to find the answer for: What is the 100th prime number?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"def is_prime(n):\\n if n <= 1:\\n return False\\n if n <= 3:\\n return True\\n if n % 2 == 0 or n % 3 == 0:\\n return False\\n i = 5\\n while i * i <= n:\\n if n % i == 0 or n % (i + 2) == 0:\\n return False\\n i += 6\\n return True\\n\\ndef get_nth_prime(n):\\n count = 0\\n num = 2\\n while True:\\n if is_prime(num):\\n count += 1\\n if count == n:\\n return num\\n num += 1\\n\\nprint(get_nth_prime(100))\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stdout]\\n541\\n[/stdout]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " are trying", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -36970,7 +37524,7 @@ "data": { "event": { "delta": { - "text": " to", + "text": "The", "type": "text" }, "event_type": { @@ -36990,7 +37544,7 @@ "data": { "event": { "delta": { - "text": " load", + "text": " 100th prime number is 541.", "type": "text" }, "event_type": { @@ -37010,33 +37564,58 @@ "data": { "event": { "delta": { - "text": ".\n3", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 217 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 20 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 237 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Write code and execute it to find the answer for: What is the 100th prime number?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"def is_prime(n):\\n if n <= 1:\\n return False\\n if n <= 3:\\n return True\\n if n % 2 == 0 or n % 3 == 0:\\n return False\\n i = 5\\n while i * i <= n:\\n if n % i == 0 or n % (i + 2) == 0:\\n return False\\n i += 6\\n return True\\n\\ndef get_nth_prime(n):\\n count = 0\\n num = 2\\n while True:\\n if is_prime(num):\\n count += 1\\n if count == n:\\n return num\\n num += 1\\n\\nprint(get_nth_prime(100))\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"completed\\n[stdout]\\n541\\n[/stdout]\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": ". Check", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -37050,7 +37629,7 @@ "data": { "event": { "delta": { - "text": " the", + "text": "The", "type": "text" }, "event_type": { @@ -37070,7 +37649,7 @@ "data": { "event": { "delta": { - "text": " file format", + "text": " 100th prime number is 541.", "type": "text" }, "event_type": { @@ -37090,33 +37669,58 @@ "data": { "event": { "delta": { - "text": ": Make", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 217 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 20 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 237 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Write code and execute it to find the answer for: What is the 100th prime number?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " sure", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -37130,8 +37734,13 @@ "data": { "event": { "delta": { - "text": " the file", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -37150,8 +37759,13 @@ "data": { "event": { "delta": { - "text": " is in", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "def is_prime(n):\n if n <= 1:\n return False", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -37170,8 +37784,13 @@ "data": { "event": { "delta": { - "text": " the correct", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\n if n <= 3:\n ", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -37190,8 +37809,13 @@ "data": { "event": { "delta": { - "text": " format (", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " return True\n if n % 2 ==", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -37210,8 +37834,13 @@ "data": { "event": { "delta": { - "text": "CSV)", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " 0 or n % 3 == 0:\n return False", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -37230,8 +37859,13 @@ "data": { "event": { "delta": { - "text": " and it", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\n i = 5\n while i * i <= n:\n ", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -37250,8 +37884,13 @@ "data": { "event": { "delta": { - "text": " is not", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " if n % i == 0 or n % (i + 2", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -37270,8 +37909,13 @@ "data": { "event": { "delta": { - "text": " corrupted.\n", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ") == 0:\n return False\n i += 6\n ", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -37290,8 +37934,13 @@ "data": { "event": { "delta": { - "text": "4.", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " return True\n\ndef get_nth_prime(n):\n count = 0\n ", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -37310,8 +37959,13 @@ "data": { "event": { "delta": { - "text": " Try a", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " num = 2\n while True:\n if is_prime(num):\n ", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -37330,8 +37984,13 @@ "data": { "event": { "delta": { - "text": " different file", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " count += 1\n if count == n:\n return num\n", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -37350,8 +38009,13 @@ "data": { "event": { "delta": { - "text": ": If", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " num += 1\n\nprint(get_nth_prime(100))", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -37370,8 +38034,23 @@ "data": { "event": { "delta": { - "text": " the file", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "def is_prime(n):\n if n <= 1:\n return False\n if n <= 3:\n return True\n if n % 2 == 0 or n % 3 == 0:\n return False\n i = 5\n while i * i <= n:\n if n % i == 0 or n % (i + 2) == 0:\n return False\n i += 6\n return True\n\ndef get_nth_prime(n):\n count = 0\n num = 2\n while True:\n if is_prime(num):\n count += 1\n if count == n:\n return num\n num += 1\n\nprint(get_nth_prime(100))" + }, + "call_id": "d8ece88b-7b3e-4f72-9555-5a928c27012c", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -37379,7 +38058,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -37390,33 +38073,58 @@ "data": { "event": { "delta": { - "text": " is not", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 40 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 50 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"when was Perplexity the company founded?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Perplexity company founding date\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Perplexity company founding date\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " available,", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -37430,7 +38138,7 @@ "data": { "event": { "delta": { - "text": " try loading", + "text": "Per", "type": "text" }, "event_type": { @@ -37450,7 +38158,7 @@ "data": { "event": { "delta": { - "text": " a different", + "text": "plexity the company was founded in 202", "type": "text" }, "event_type": { @@ -37470,7 +38178,7 @@ "data": { "event": { "delta": { - "text": " file to", + "text": "2.", "type": "text" }, "event_type": { @@ -37490,33 +38198,58 @@ "data": { "event": { "delta": { - "text": " see if", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 105 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 22 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 127 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"when was Perplexity the company founded?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Perplexity company founding date\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " the", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -37530,7 +38263,7 @@ "data": { "event": { "delta": { - "text": " issue is", + "text": "{\"", "type": "text" }, "event_type": { @@ -37550,7 +38283,7 @@ "data": { "event": { "delta": { - "text": " specific to", + "text": "type\": \"function\", \"name\": \"knowledge_search\", \"", "type": "text" }, "event_type": { @@ -37570,7 +38303,7 @@ "data": { "event": { "delta": { - "text": " this", + "text": "parameters\": {\"query\": \"Perplexity company founding date\"}}", "type": "text" }, "event_type": { @@ -37590,8 +38323,19 @@ "data": { "event": { "delta": { - "text": " file or", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "Perplexity company founding date" + }, + "call_id": "5ea88dde-f090-4157-9219-45a16100ef21", + "tool_name": "knowledge_search" + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -37599,7 +38343,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -37610,53 +38358,58 @@ "data": { "event": { "delta": { - "text": " not.\n\n", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "If you", - "type": "text" + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 67 }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + { + "metric": "completion_tokens", + "unit": null, + "value": 37 }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null + { + "metric": "total_tokens", + "unit": null, + "value": 104 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"when was Perplexity the company founded?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Perplexity company founding date\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Perplexity company founding date\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " are still", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -37670,7 +38423,7 @@ "data": { "event": { "delta": { - "text": " having trouble", + "text": "Per", "type": "text" }, "event_type": { @@ -37690,7 +38443,7 @@ "data": { "event": { "delta": { - "text": ",", + "text": "plexity the company was founded in 2022.", "type": "text" }, "event_type": { @@ -37710,33 +38463,58 @@ "data": { "event": { "delta": { - "text": " please provide", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 105 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 22 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 127 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"when was Perplexity the company founded?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Perplexity company founding date\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " more information", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -37750,7 +38528,7 @@ "data": { "event": { "delta": { - "text": " about the", + "text": "{\"", "type": "text" }, "event_type": { @@ -37770,7 +38548,7 @@ "data": { "event": { "delta": { - "text": " file and", + "text": "type\": \"function\", \"name\": \"knowledge_search\", \"parameters", "type": "text" }, "event_type": { @@ -37790,7 +38568,7 @@ "data": { "event": { "delta": { - "text": " the error", + "text": "\": {\"query\": \"Perplexity company founding", "type": "text" }, "event_type": { @@ -37810,7 +38588,7 @@ "data": { "event": { "delta": { - "text": " message you", + "text": " date\"}}", "type": "text" }, "event_type": { @@ -37830,8 +38608,19 @@ "data": { "event": { "delta": { - "text": " are receiving", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "Perplexity company founding date" + }, + "call_id": "7f40db23-2182-4006-9234-4c5b7dac978f", + "tool_name": "knowledge_search" + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -37839,7 +38628,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -37850,33 +38643,58 @@ "data": { "event": { "delta": { - "text": ", and", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 67 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 37 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 104 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"when was Perplexity the company founded?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " I will", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -37890,8 +38708,13 @@ "data": { "event": { "delta": { - "text": " do my", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -37910,8 +38733,13 @@ "data": { "event": { "delta": { - "text": " best to", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "{\"type\": \"function\", \"name\": \"knowledge_search\", \"parameters", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -37930,8 +38758,13 @@ "data": { "event": { "delta": { - "text": " assist you", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\": {\"query\": \"Perplexity company founding date\"}}", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -37950,8 +38783,19 @@ "data": { "event": { "delta": { - "text": ".", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "Perplexity company founding date" + }, + "call_id": "7f65affe-6ecb-4db5-b70f-71e05e28c310", + "tool_name": "knowledge_search" + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -37959,7 +38803,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -37989,17 +38837,17 @@ { "metric": "prompt_tokens", "unit": null, - "value": 732 + "value": 29 }, { "metric": "completion_tokens", "unit": null, - "value": 239 + "value": 10 }, { "metric": "total_tokens", "unit": null, - "value": 971 + "value": 39 } ] } @@ -38007,7 +38855,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print the shape of the dataframe (number of rows and columns)\\nprint(df.shape)\\n\\n# Print the column names\\nprint(df.columns)\\n\\n# Print the data types of each column\\nprint(df.dtypes)\\n\\n# Print a summary of the dataframe (count, mean, std, min, 25%, 50%, 75%, max)\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"The error message indicates that the file \\\"\" does not exist. This could be due to a number of reasons, such as the file being deleted or moved, or the path being incorrect.\\n\\nTo resolve this issue, you can try the following:\\n\\n1. Check the file path: Make sure the file path is correct and the file exists at that location.\\n2. Check the file name: Make sure the file name is correct and it matches the one you are trying to load.\\n3. Check the file format: Make sure the file is in the correct format (CSV) and it is not corrupted.\\n4. Try a different file: If the file is not available, try loading a different file to see if the issue is specific to this file or not.\\n\\nIf you are still having trouble, please provide more information about the file and the error message you are receiving, and I will do my best to assist you.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"when was the nba created?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"when was the nba created\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"when was the nba created\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -38035,13 +38883,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" + "text": "The", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -38060,13 +38903,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "import pandas", - "type": "tool_call" + "text": " NBA was created on August 3, 1949, with the", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -38085,13 +38923,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " as pd", - "type": "tool_call" + "text": " merger of the Basketball Association of America (BAA) and the National", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -38110,13 +38943,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "\n", - "type": "tool_call" + "text": " Basketball League (NBL).", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -38135,43 +38963,42 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "import matplotlib", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"when was the nba created?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"when was the nba created\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": ".pyplot as", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -38185,13 +39012,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " plt\n\n", - "type": "tool_call" + "text": "The", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -38210,13 +39032,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "# Load", - "type": "tool_call" + "text": " NBA was created on August 3, 1949, with", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -38235,13 +39052,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " the CSV", - "type": "tool_call" + "text": " the merger of the Basketball Association of America (BAA) and", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -38260,13 +39072,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " file\n", - "type": "tool_call" + "text": " the National Basketball League (NBL).", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -38285,43 +39092,58 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "df =", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 65 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 45 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 110 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"when was the nba created?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"when was the nba created\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " pd.read", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -38335,13 +39157,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "_csv(\"/", - "type": "tool_call" + "text": "The", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -38360,13 +39177,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "var/f", - "type": "tool_call" + "text": " NBA was created on August 3,", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -38385,13 +39197,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "olders/r", - "type": "tool_call" + "text": " 1949, with the merger of the Basketball Association of", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -38410,13 +39217,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "b/q", - "type": "tool_call" + "text": " America (BAA) and the National Basketball League (NBL", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -38435,13 +39237,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "v8", - "type": "tool_call" + "text": ").", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -38460,43 +39257,58 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "vw", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 65 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 45 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 110 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"when was the nba created?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "gy", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -38513,9 +39325,9 @@ "parse_status": { "__enum__": "ToolCallParseStatus", "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" + "value": "started" }, - "tool_call": "j6", + "tool_call": "", "type": "tool_call" }, "event_type": { @@ -38540,7 +39352,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "yjd", + "tool_call": "{\"type\": \"function\", \"name\": \"", "type": "tool_call" }, "event_type": { @@ -38565,7 +39377,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "3t", + "tool_call": "knowledge_search\", \"parameters\": {\"query\": \"when", "type": "tool_call" }, "event_type": { @@ -38590,7 +39402,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "4p", + "tool_call": " was the nba created\"}}", "type": "tool_call" }, "event_type": { @@ -38613,14090 +39425,16 @@ "parse_status": { "__enum__": "ToolCallParseStatus", "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "ws", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "y", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "9t", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "0rm", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "0000", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "gn", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "/T", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "/tmpx", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "gxj", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "70y", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "_/Z", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "qZ", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "39W", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "iyin", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "flation.csv", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "\")\n\n", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "# Convert", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " the '", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "Year'", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " column to", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " datetime\n", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "df['", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "Year']", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " = pd", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": ".to_datetime", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "(df['", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "Year", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "'],", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " format='%", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "Y')\n\n", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "# Group", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " by '", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "Year", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "'", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " and calculate", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " the average", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " inflation\n", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "df_avg", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "_inflation", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " = df", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": ".groupby('", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "Year')['", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "Inflation", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "'].mean", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "().", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "reset", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "_index()\n\n", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "# Plot", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " the average", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " inflation", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " as", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " a time", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " series\n", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "plt", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": ".figure(figsize", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "=(10", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": ",6", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "))\nplt", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": ".plot(df", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "_avg_in", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "flation['", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "Year'],", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " df_avg", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "_inflation", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "['", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "In", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "flation'], marker='o", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "')\nplt", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": ".title", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "('Average", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " Yearly", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " Inflation", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "')\nplt", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": ".xlabel('", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "Year')\n", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "plt.ylabel('Inflation')\nplt.grid(True)\n", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "plt.show", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "()", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4pwsy9t0rm0000gn/T/tmpxgxj70y_/ZqZ39Wiyinflation.csv\")\n\n# Convert the 'Year' column to datetime\ndf['Year'] = pd.to_datetime(df['Year'], format='%Y')\n\n# Group by 'Year' and calculate the average inflation\ndf_avg_inflation = df.groupby('Year')['Inflation'].mean().reset_index()\n\n# Plot the average inflation as a time series\nplt.figure(figsize=(10,6))\nplt.plot(df_avg_inflation['Year'], df_avg_inflation['Inflation'], marker='o')\nplt.title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel('Inflation')\nplt.grid(True)\nplt.show()" - }, - "call_id": "48277e37-1992-4510-9751-9895707cb190", - "tool_name": { - "__enum__": "BuiltinTool", - "__module__": "llama_stack.models.llama.datatypes", - "value": "code_interpreter" - } - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "metric": "prompt_tokens", - "unit": null, - "value": 484 - }, - { - "metric": "completion_tokens", - "unit": null, - "value": 10 - }, - { - "metric": "total_tokens", - "unit": null, - "value": 494 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics about the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " error message indicates that the file \"/var/folders/rb/qv8vwgyj6y", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "jd3t4pwsy9t0rm0000gn/T/tmp2x_sml", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "66/9vYvmVRoinflation.csv\" does not exist. This could be", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " due to a variety of", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " reasons such as the file", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " being deleted, the path being incorrect, or the file", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " not being accessible.\n\nTo resolve this issue, you can try", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " the following:\n\n1. Check the file path: Ensure that", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " the file path is correct and the file exists at that", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " location.\n2. Check file permissions: Ensure that", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " the file is accessible and", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " you have the necessary permissions to read", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " it.\n3. Try a different file: If", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " the file is not accessible, try loading a different file to see", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " if the issue is specific to this file or a general", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " issue with your code.\n", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "4. Check for typos: Ensure that", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " there are no typos in the file path or the code.\n\n", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "If you are", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " still having issues, please provide more details about", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " the file and the code you are using", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": ", and I'll be happy to help further.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "KwfNrQLy", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:19.630894+00:00", - "__module__": "datetime" - }, - "trace_id": "kNsljyzfQV2Cn4aZ", - "type": "metric", - "unit": "tokens", - "value": 192 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "KwfNrQLy", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:19.630987+00:00", - "__module__": "datetime" - }, - "trace_id": "kNsljyzfQV2Cn4aZ", - "type": "metric", - "unit": "tokens", - "value": 238 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "KwfNrQLy", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:19.630996+00:00", - "__module__": "datetime" - }, - "trace_id": "kNsljyzfQV2Cn4aZ", - "type": "metric", - "unit": "tokens", - "value": 430 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print the shape of the dataframe (number of rows and columns)\\nprint(df.shape)\\n\\n# Print the column names\\nprint(df.columns)\\n\\n# Print the data types of each column\\nprint(df.dtypes)\\n\\n# Print a summary of the dataframe (count, mean, std, min, 25%, 50%, 75%, max)\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " error message", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " indicates that", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " the file", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " \"/", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "var/f", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "olders/r", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "b/q", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "v8", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "vwgy", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "j6", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "yjd", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "3t", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "4p", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "wsy", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "9", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "t0", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "rm000", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "0gn", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "/T/tmp", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "xgx", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "j70", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "y_/", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "Zq", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "Z39", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "Wiy", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "inflation", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": ".csv", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "\" does", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " not exist", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": ". This", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " could be", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " due to", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " a", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " number of", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " reasons,", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " such as", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " the file", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " being deleted", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " or", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " moved,", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " or the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " path being", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " incorrect.\n\n", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "To resolve", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " this issue", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": ", you", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " can try", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " the following", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": ":\n\n1", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": ".", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " Check the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " file path", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": ": Make", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " sure the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " file path", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " is", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " correct", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " and the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " file exists", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " at that", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " location", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": ".\n", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "2.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " Check the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " file name", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": ": Make", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " sure the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " file name", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " is correct", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " and it", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " matches the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " one you", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " are trying", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " to load", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": ".\n3", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": ". Check", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " the file", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " format:", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " Make sure", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " the file", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " is", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " in", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " the correct format (CSV)", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " and it is not", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " corrupted.\n", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "4", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": ".", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " Try a", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " different file", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": ":", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " If", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " the file", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " is not", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " available,", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " try loading", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " a different", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " file to", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " see if", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " issue is", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " specific to", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " this file", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " or not", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": ".\n\nIf", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " you are", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " still having", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " trouble,", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " please provide", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " more information", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " about the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " file and", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " the error", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " message", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " you are", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " receiving,", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " and I", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " will", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " do my", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " best to", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " assist you", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": ".", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "metric": "prompt_tokens", - "unit": null, - "value": 243 - }, - { - "metric": "completion_tokens", - "unit": null, - "value": 239 - }, - { - "metric": "total_tokens", - "unit": null, - "value": 482 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "import pandas", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " as pd", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "\n\n#", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " Load the", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " CSV file", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "\n", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "df =", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " pd.read", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "_csv(\"/", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "var/f", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "olders/r", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "b/q", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "v8", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "vwgy", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "j6", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "yjd", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "3", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "t", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "4p", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "wsy", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "9t", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "0rm", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "000", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "0", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "gn/T", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "/tmpx", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "gxj70y_/ZqZ39", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "Wiy", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "inflation", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": ".csv\")\n\n", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "# Print", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " the first", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " few rows", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " of the", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " dataframe\n", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "print(df", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": ".head())\n\n", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "# Print", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " the shape", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " of the", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " dataframe (", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "number", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " of rows", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " and columns", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": ")\n", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "print", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "(df.shape", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": ")\n\n#", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " Print the", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " column names", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "\nprint", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "(df.columns", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": ")\n\n#", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " Print the", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " data types", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " of each", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " column\n", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "print(df", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": ".dtypes", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": ")\n\n#", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " Print a", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " summary of", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " the", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " dataframe (", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "count,", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " mean,", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " std", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": ",", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " min,", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " 25", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "%, ", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "50%,", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " 75", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "%, max", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": ")\n", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "print", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "(df.describe", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "())", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "code": "import pandas as pd\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4pwsy9t0rm0000gn/T/tmpxgxj70y_/ZqZ39Wiyinflation.csv\")\n\n# Print the first few rows of the dataframe\nprint(df.head())\n\n# Print the shape of the dataframe (number of rows and columns)\nprint(df.shape)\n\n# Print the column names\nprint(df.columns)\n\n# Print the data types of each column\nprint(df.dtypes)\n\n# Print a summary of the dataframe (count, mean, std, min, 25%, 50%, 75%, max)\nprint(df.describe())" - }, - "call_id": "c3783091-f1cf-49e0-bc3b-a827618dcbe0", - "tool_name": { - "__enum__": "BuiltinTool", - "__module__": "llama_stack.models.llama.datatypes", - "value": "code_interpreter" - } - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "metric": "prompt_tokens", - "unit": null, - "value": 36 - }, - { - "metric": "completion_tokens", - "unit": null, - "value": 10 - }, - { - "metric": "total_tokens", - "unit": null, - "value": 46 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:42933\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:20e5d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0cd43\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"using LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:20e5d\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:20e5d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:20e5d\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:42933\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:20e5d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0cd43\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "[k", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "nowledge_search(query=\"using LoRA in Torchtune", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "\")]", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "using LoRA in Torchtune" - }, - "call_id": "ce4b06be-6e7f-45cf-9555-25398caaf4f1", - "tool_name": "knowledge_search" - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "vGtNmXNY", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:32.673350+00:00", - "__module__": "datetime" - }, - "trace_id": "8C2YTmRESTKZ0i1l", - "type": "metric", - "unit": "tokens", - "value": 107 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "vGtNmXNY", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:32.673375+00:00", - "__module__": "datetime" - }, - "trace_id": "8C2YTmRESTKZ0i1l", - "type": "metric", - "unit": "tokens", - "value": 23 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "vGtNmXNY", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:32.673381+00:00", - "__module__": "datetime" - }, - "trace_id": "8C2YTmRESTKZ0i1l", - "type": "metric", - "unit": "tokens", - "value": 130 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:42933\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:20e5d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0cd43\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "I", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "'m ready to help. What's", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " your question about Torchtune?", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "7n3WMt3R", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:31.179269+00:00", - "__module__": "datetime" - }, - "trace_id": "BLgI_VzNTCCRs_2T", - "type": "metric", - "unit": "tokens", - "value": 75 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "7n3WMt3R", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:31.179301+00:00", - "__module__": "datetime" - }, - "trace_id": "BLgI_VzNTCCRs_2T", - "type": "metric", - "unit": "tokens", - "value": 25 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "7n3WMt3R", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:31.179308+00:00", - "__module__": "datetime" - }, - "trace_id": "BLgI_VzNTCCRs_2T", - "type": "metric", - "unit": "tokens", - "value": 100 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:8106c\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:a03f3\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0719d\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your first question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"using LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:a03f3\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:a03f3\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:a03f3\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:8106c\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:a03f3\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0719d\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your first question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "[k", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "nowledge_search(query=\"using LoRA in Torchtune", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "\")]", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "using LoRA in Torchtune" - }, - "call_id": "d45a488f-368a-4a3b-a2d9-8fde584fc8f8", - "tool_name": "knowledge_search" - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "qLPBZlok", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:26.209198+00:00", - "__module__": "datetime" - }, - "trace_id": "7GQeegpgTI-gqjHp", - "type": "metric", - "unit": "tokens", - "value": 108 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "qLPBZlok", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:26.209239+00:00", - "__module__": "datetime" - }, - "trace_id": "7GQeegpgTI-gqjHp", - "type": "metric", - "unit": "tokens", - "value": 23 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "qLPBZlok", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:26.209247+00:00", - "__module__": "datetime" - }, - "trace_id": "7GQeegpgTI-gqjHp", - "type": "metric", - "unit": "tokens", - "value": 131 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:8106c\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:a03f3\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0719d\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "I", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "'m ready to help. What's", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " your first question about Torchtune", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "?", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "mYTkxvK_", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:23.525734+00:00", - "__module__": "datetime" - }, - "trace_id": "kpcdkZQ2SsSOh9Lw", - "type": "metric", - "unit": "tokens", - "value": 75 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "mYTkxvK_", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:23.525763+00:00", - "__module__": "datetime" - }, - "trace_id": "kpcdkZQ2SsSOh9Lw", - "type": "metric", - "unit": "tokens", - "value": 26 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "mYTkxvK_", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:23.525770+00:00", - "__module__": "datetime" - }, - "trace_id": "kpcdkZQ2SsSOh9Lw", - "type": "metric", - "unit": "tokens", - "value": 101 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:89553\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:700ad\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:122a9\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"using LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:700ad\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:700ad\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:700ad\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:89553\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:700ad\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:122a9\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "[k", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "nowledge_search", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "(query=\"", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "using Lo", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "RA in", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " Torcht", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "une\")]", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "using LoRA in Torchtune" - }, - "call_id": "0d7b903c-c045-4c45-88d8-1200d809b37a", - "tool_name": "knowledge_search" - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "metric": "prompt_tokens", - "unit": null, - "value": 107 - }, - { - "metric": "completion_tokens", - "unit": null, - "value": 23 - }, - { - "metric": "total_tokens", - "unit": null, - "value": 130 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:89553\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:700ad\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:122a9\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "I", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "'m ready", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " to help", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": ". What", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "'s your", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " question", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " about", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " Torcht", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "une?", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "metric": "prompt_tokens", - "unit": null, - "value": 75 - }, - { - "metric": "completion_tokens", - "unit": null, - "value": 25 - }, - { - "metric": "total_tokens", - "unit": null, - "value": 100 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:b065e\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:fe9fc\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:c96ea\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"using LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:fe9fc\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:fe9fc\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:fe9fc\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "5", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -52775,7 +39522,7 @@ "data": { "event": { "delta": { - "text": ". Run", + "text": "The", "type": "text" }, "event_type": { @@ -52795,7 +39542,7 @@ "data": { "event": { "delta": { - "text": " the", + "text": " provided function definitions", "type": "text" }, "event_type": { @@ -52815,7 +39562,7 @@ "data": { "event": { "delta": { - "text": " Lo", + "text": " are not suitable", "type": "text" }, "event_type": { @@ -52835,7 +39582,7 @@ "data": { "event": { "delta": { - "text": "RA fin", + "text": " for this task. Please re", "type": "text" }, "event_type": { @@ -52855,7 +39602,7 @@ "data": { "event": { "delta": { - "text": "etune", + "text": "work them to", "type": "text" }, "event_type": { @@ -52875,7 +39622,7 @@ "data": { "event": { "delta": { - "text": " using torch", + "text": " align with the task requirements.", "type": "text" }, "event_type": { @@ -52895,53 +39642,94 @@ "data": { "event": { "delta": { - "text": "tune", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "'s Lo", - "type": "text" + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "D2n_IS_8", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:03:32.021393+00:00", + "__module__": "datetime" + }, + "trace_id": "amAiZv5PQKSsA74j", + "type": "metric", + "unit": "tokens", + "value": 90 }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "D2n_IS_8", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:03:32.021420+00:00", + "__module__": "datetime" + }, + "trace_id": "amAiZv5PQKSsA74j", + "type": "metric", + "unit": "tokens", + "value": 32 }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "D2n_IS_8", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:03:32.021427+00:00", + "__module__": "datetime" + }, + "trace_id": "amAiZv5PQKSsA74j", + "type": "metric", + "unit": "tokens", + "value": 122 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. \", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "RA recipe", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -52955,7 +39743,7 @@ "data": { "event": { "delta": { - "text": ": `", + "text": "[", "type": "text" }, "event_type": { @@ -52975,7 +39763,7 @@ "data": { "event": { "delta": { - "text": "tune", + "text": "get_boiling_point(liquid_name=\"polyjuice\", celcius", "type": "text" }, "event_type": { @@ -52995,7 +39783,7 @@ "data": { "event": { "delta": { - "text": " run --", + "text": "=True)]", "type": "text" }, "event_type": { @@ -53015,8 +39803,20 @@ "data": { "event": { "delta": { - "text": "nnodes", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "celcius": true, + "liquid_name": "polyjuice" + }, + "call_id": "fc83cd58-3cfb-431d-a1e2-a8572d682e2f", + "tool_name": "get_boiling_point" + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -53024,7 +39824,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -53035,53 +39839,94 @@ "data": { "event": { "delta": { - "text": " 1", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " --n", - "type": "text" + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "YhFB39Ik", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:31.335148+00:00", + "__module__": "datetime" + }, + "trace_id": "3n2xEtjLQt6ZGVR_", + "type": "metric", + "unit": "tokens", + "value": 267 }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "YhFB39Ik", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:31.335179+00:00", + "__module__": "datetime" + }, + "trace_id": "3n2xEtjLQt6ZGVR_", + "type": "metric", + "unit": "tokens", + "value": 28 }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "YhFB39Ik", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:31.335185+00:00", + "__module__": "datetime" + }, + "trace_id": "3n2xEtjLQt6ZGVR_", + "type": "metric", + "unit": "tokens", + "value": 295 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. \", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "proc_per", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -53095,7 +39940,7 @@ "data": { "event": { "delta": { - "text": "_node ", + "text": "[", "type": "text" }, "event_type": { @@ -53115,7 +39960,7 @@ "data": { "event": { "delta": { - "text": "2 l", + "text": "get_boiling_point(liquid_name=\"polyjuice\", celcius", "type": "text" }, "event_type": { @@ -53135,7 +39980,7 @@ "data": { "event": { "delta": { - "text": "ora_f", + "text": "=True)]", "type": "text" }, "event_type": { @@ -53155,8 +40000,20 @@ "data": { "event": { "delta": { - "text": "inetune", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "celcius": true, + "liquid_name": "polyjuice" + }, + "call_id": "7d41a671-f3ce-46dd-b001-443aaa65ccb7", + "tool_name": "get_boiling_point" + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -53164,7 +40021,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -53175,53 +40036,94 @@ "data": { "event": { "delta": { - "text": "_distributed", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " --config", - "type": "text" + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "lnqeV_cZ", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:29.708270+00:00", + "__module__": "datetime" + }, + "trace_id": "me4qbUSCQ5yKvrAG", + "type": "metric", + "unit": "tokens", + "value": 211 }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "lnqeV_cZ", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:29.708281+00:00", + "__module__": "datetime" + }, + "trace_id": "me4qbUSCQ5yKvrAG", + "type": "metric", + "unit": "tokens", + "value": 28 }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "lnqeV_cZ", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:29.708284+00:00", + "__module__": "datetime" + }, + "trace_id": "me4qbUSCQ5yKvrAG", + "type": "metric", + "unit": "tokens", + "value": 239 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. \", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " llama2", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -53235,7 +40137,7 @@ "data": { "event": { "delta": { - "text": "/7", + "text": "[", "type": "text" }, "event_type": { @@ -53255,7 +40157,7 @@ "data": { "event": { "delta": { - "text": "B_l", + "text": "get_boiling_point(liquid_name=\"polyjuice\", celcius", "type": "text" }, "event_type": { @@ -53275,7 +40177,7 @@ "data": { "event": { "delta": { - "text": "ora`\n\n", + "text": "=True)]", "type": "text" }, "event_type": { @@ -53295,8 +40197,20 @@ "data": { "event": { "delta": { - "text": "You can", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "celcius": true, + "liquid_name": "polyjuice" + }, + "call_id": "21c8e60f-d205-4b3d-b065-47fa56dcd273", + "tool_name": "get_boiling_point" + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -53304,7 +40218,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -53315,33 +40233,94 @@ "data": { "event": { "delta": { - "text": " also experiment", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "TDJHPVDZ", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:28.195776+00:00", + "__module__": "datetime" + }, + "trace_id": "r2GKj8iqTYaNxTeq", + "type": "metric", + "unit": "tokens", + "value": 155 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "TDJHPVDZ", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:28.195808+00:00", + "__module__": "datetime" + }, + "trace_id": "r2GKj8iqTYaNxTeq", + "type": "metric", + "unit": "tokens", + "value": 28 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "TDJHPVDZ", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:28.195814+00:00", + "__module__": "datetime" + }, + "trace_id": "r2GKj8iqTYaNxTeq", + "type": "metric", + "unit": "tokens", + "value": 183 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. \", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " with different", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -53355,7 +40334,7 @@ "data": { "event": { "delta": { - "text": " LoRA", + "text": "[", "type": "text" }, "event_type": { @@ -53375,7 +40354,7 @@ "data": { "event": { "delta": { - "text": " configurations,", + "text": "get_boiling_point(liquid_name=\"polyjuice\", celcius", "type": "text" }, "event_type": { @@ -53395,7 +40374,7 @@ "data": { "event": { "delta": { - "text": " such as", + "text": "=True)]", "type": "text" }, "event_type": { @@ -53415,8 +40394,20 @@ "data": { "event": { "delta": { - "text": " applying Lo", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "celcius": true, + "liquid_name": "polyjuice" + }, + "call_id": "135d468e-6391-401d-a3c0-3b08c3a6eb8c", + "tool_name": "get_boiling_point" + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -53424,7 +40415,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -53435,33 +40430,94 @@ "data": { "event": { "delta": { - "text": "RA to", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "8pZtsyNW", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:47:51.321089+00:00", + "__module__": "datetime" + }, + "trace_id": "1Ly70plQQGel5jgc", + "type": "metric", + "unit": "tokens", + "value": 99 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "8pZtsyNW", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:47:51.321130+00:00", + "__module__": "datetime" + }, + "trace_id": "1Ly70plQQGel5jgc", + "type": "metric", + "unit": "tokens", + "value": 28 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "8pZtsyNW", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:47:51.321140+00:00", + "__module__": "datetime" + }, + "trace_id": "1Ly70plQQGel5jgc", + "type": "metric", + "unit": "tokens", + "value": 127 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. \", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " all linear", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -53475,7 +40531,7 @@ "data": { "event": { "delta": { - "text": " layers in", + "text": "[", "type": "text" }, "event_type": { @@ -53495,7 +40551,7 @@ "data": { "event": { "delta": { - "text": " the self", + "text": "get_boiling_point(liquid_name='polyjuice", "type": "text" }, "event_type": { @@ -53515,7 +40571,7 @@ "data": { "event": { "delta": { - "text": "-attention", + "text": "', celcius=True)]", "type": "text" }, "event_type": { @@ -53535,8 +40591,20 @@ "data": { "event": { "delta": { - "text": ", increasing", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "celcius": true, + "liquid_name": "polyjuice" + }, + "call_id": "3955f756-9aa0-433f-be8f-af8941c220de", + "tool_name": "get_boiling_point" + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -53544,7 +40612,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -53555,33 +40627,94 @@ "data": { "event": { "delta": { - "text": " the rank", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "QZ6PSGpT", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:03:29.629456+00:00", + "__module__": "datetime" + }, + "trace_id": "M72bosg8TBe3uhx3", + "type": "metric", + "unit": "tokens", + "value": 43 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "QZ6PSGpT", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:03:29.629488+00:00", + "__module__": "datetime" + }, + "trace_id": "M72bosg8TBe3uhx3", + "type": "metric", + "unit": "tokens", + "value": 28 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "QZ6PSGpT", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:03:29.629494+00:00", + "__module__": "datetime" + }, + "trace_id": "M72bosg8TBe3uhx3", + "type": "metric", + "unit": "tokens", + "value": 71 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": ", and", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -53595,7 +40728,7 @@ "data": { "event": { "delta": { - "text": " scaling alpha", + "text": "The", "type": "text" }, "event_type": { @@ -53615,7 +40748,7 @@ "data": { "event": { "delta": { - "text": " and", + "text": " function call returned an", "type": "text" }, "event_type": { @@ -53635,7 +40768,7 @@ "data": { "event": { "delta": { - "text": " rank", + "text": " error since", "type": "text" }, "event_type": { @@ -53655,7 +40788,7 @@ "data": { "event": { "delta": { - "text": " together.\n\n", + "text": " \"", "type": "text" }, "event_type": { @@ -53675,7 +40808,7 @@ "data": { "event": { "delta": { - "text": "Note:", + "text": "polyjuice\" is", "type": "text" }, "event_type": { @@ -53695,7 +40828,7 @@ "data": { "event": { "delta": { - "text": " You need", + "text": " not a real liquid. Polyju", "type": "text" }, "event_type": { @@ -53715,7 +40848,7 @@ "data": { "event": { "delta": { - "text": " to have", + "text": "ice is a fictional substance from the", "type": "text" }, "event_type": { @@ -53735,7 +40868,7 @@ "data": { "event": { "delta": { - "text": " the pre", + "text": " Harry Potter series. The boiling point", "type": "text" }, "event_type": { @@ -53755,7 +40888,7 @@ "data": { "event": { "delta": { - "text": "-trained L", + "text": " of a substance is a physical", "type": "text" }, "event_type": { @@ -53775,7 +40908,7 @@ "data": { "event": { "delta": { - "text": "lama2", + "text": " property that can be measured and", "type": "text" }, "event_type": { @@ -53795,7 +40928,7 @@ "data": { "event": { "delta": { - "text": " weights and", + "text": " quantified", "type": "text" }, "event_type": { @@ -53815,7 +40948,7 @@ "data": { "event": { "delta": { - "text": " tokenizer downloaded", + "text": ", but it only applies", "type": "text" }, "event_type": { @@ -53835,7 +40968,7 @@ "data": { "event": { "delta": { - "text": " and installed", + "text": " to real substances that exist in the physical world.", "type": "text" }, "event_type": { @@ -53855,33 +40988,94 @@ "data": { "event": { "delta": { - "text": " before running", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "y9SHtJTQ", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:05:01.411612+00:00", + "__module__": "datetime" + }, + "trace_id": "_I2Cu85IRtOSBSX9", + "type": "metric", + "unit": "tokens", + "value": 84 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "y9SHtJTQ", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:05:01.411644+00:00", + "__module__": "datetime" + }, + "trace_id": "_I2Cu85IRtOSBSX9", + "type": "metric", + "unit": "tokens", + "value": 73 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "y9SHtJTQ", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:05:01.411650+00:00", + "__module__": "datetime" + }, + "trace_id": "_I2Cu85IRtOSBSX9", + "type": "metric", + "unit": "tokens", + "value": 157 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " the Lo", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -53895,7 +41089,7 @@ "data": { "event": { "delta": { - "text": "RA fin", + "text": "The", "type": "text" }, "event_type": { @@ -53915,7 +41109,7 @@ "data": { "event": { "delta": { - "text": "et", + "text": " function get_boiling_point is not", "type": "text" }, "event_type": { @@ -53935,7 +41129,7 @@ "data": { "event": { "delta": { - "text": "une.", + "text": " recognized.", "type": "text" }, "event_type": { @@ -53972,19 +41166,55 @@ }, "metrics": [ { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "prompt_tokens", - "unit": null, - "value": 146 + "span_id": "Z7jBGJ-8", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:55.401637+00:00", + "__module__": "datetime" + }, + "trace_id": "WxMAq579Q-ixJ3wJ", + "type": "metric", + "unit": "tokens", + "value": 93 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "completion_tokens", - "unit": null, - "value": 272 + "span_id": "Z7jBGJ-8", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:55.401666+00:00", + "__module__": "datetime" + }, + "trace_id": "WxMAq579Q-ixJ3wJ", + "type": "metric", + "unit": "tokens", + "value": 20 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "total_tokens", - "unit": null, - "value": 418 + "span_id": "Z7jBGJ-8", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:55.401670+00:00", + "__module__": "datetime" + }, + "trace_id": "WxMAq579Q-ixJ3wJ", + "type": "metric", + "unit": "tokens", + "value": 113 } ] } @@ -53992,7 +41222,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:b065e\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:fe9fc\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:c96ea\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point_with_metadata\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point_with_metadata\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -54020,7 +41250,7 @@ "data": { "event": { "delta": { - "text": "[k", + "text": "The", "type": "text" }, "event_type": { @@ -54040,7 +41270,7 @@ "data": { "event": { "delta": { - "text": "nowledge_search", + "text": " function get_bo", "type": "text" }, "event_type": { @@ -54060,7 +41290,7 @@ "data": { "event": { "delta": { - "text": "(query=\"", + "text": "iling_point_with_metadata does not exist,", "type": "text" }, "event_type": { @@ -54080,7 +41310,7 @@ "data": { "event": { "delta": { - "text": "using Lo", + "text": " I will", "type": "text" }, "event_type": { @@ -54100,7 +41330,7 @@ "data": { "event": { "delta": { - "text": "RA in", + "text": " assume you", "type": "text" }, "event_type": { @@ -54120,7 +41350,7 @@ "data": { "event": { "delta": { - "text": " Torcht", + "text": " meant get_bo", "type": "text" }, "event_type": { @@ -54140,7 +41370,7 @@ "data": { "event": { "delta": { - "text": "une", + "text": "iling_point_with_metadata", "type": "text" }, "event_type": { @@ -54160,7 +41390,7 @@ "data": { "event": { "delta": { - "text": "\")]", + "text": ". The boiling point of polyjuice", "type": "text" }, "event_type": { @@ -54180,19 +41410,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "using LoRA in Torchtune" - }, - "call_id": "2bfd3d0d-5172-4031-af00-fd6c456e4fd9", - "tool_name": "knowledge_search" - }, - "type": "tool_call" + "text": " is -100.", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -54200,11 +41419,7 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } @@ -54232,19 +41447,55 @@ }, "metrics": [ { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "prompt_tokens", - "unit": null, - "value": 107 + "span_id": "8dM6i5mO", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:05:03.329281+00:00", + "__module__": "datetime" + }, + "trace_id": "zMJDP5dXRrChi7uE", + "type": "metric", + "unit": "tokens", + "value": 86 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "completion_tokens", - "unit": null, - "value": 23 + "span_id": "8dM6i5mO", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:05:03.329312+00:00", + "__module__": "datetime" + }, + "trace_id": "zMJDP5dXRrChi7uE", + "type": "metric", + "unit": "tokens", + "value": 45 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "total_tokens", - "unit": null, - "value": 130 + "span_id": "8dM6i5mO", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:05:03.329318+00:00", + "__module__": "datetime" + }, + "trace_id": "zMJDP5dXRrChi7uE", + "type": "metric", + "unit": "tokens", + "value": 131 } ] } @@ -54252,7 +41503,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:b065e\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:fe9fc\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:c96ea\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point_with_metadata\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point_with_metadata` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point_with_metadata\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -54280,7 +41531,7 @@ "data": { "event": { "delta": { - "text": "I", + "text": "The", "type": "text" }, "event_type": { @@ -54300,7 +41551,7 @@ "data": { "event": { "delta": { - "text": "'m ready to help. What", + "text": " function get_boiling_point_with_metadata(", "type": "text" }, "event_type": { @@ -54320,7 +41571,7 @@ "data": { "event": { "delta": { - "text": "'s your", + "text": "liquid_name=\"polyjuice\", celcius=True) should be", "type": "text" }, "event_type": { @@ -54340,7 +41591,7 @@ "data": { "event": { "delta": { - "text": " question about", + "text": " used to get the answer.", "type": "text" }, "event_type": { @@ -54360,13 +41611,94 @@ "data": { "event": { "delta": { - "text": " Tor", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "pzQMKAJc", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:56.809816+00:00", + "__module__": "datetime" + }, + "trace_id": "018KkGcOThSSiZfE", + "type": "metric", + "unit": "tokens", + "value": 97 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "pzQMKAJc", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:56.809911+00:00", + "__module__": "datetime" + }, + "trace_id": "018KkGcOThSSiZfE", + "type": "metric", + "unit": "tokens", + "value": 39 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "pzQMKAJc", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:56.809922+00:00", + "__module__": "datetime" + }, + "trace_id": "018KkGcOThSSiZfE", + "type": "metric", + "unit": "tokens", + "value": 136 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" }, "logprobs": null, "stop_reason": null @@ -54380,7 +41712,7 @@ "data": { "event": { "delta": { - "text": "chtune", + "text": "[", "type": "text" }, "event_type": { @@ -54400,7 +41732,7 @@ "data": { "event": { "delta": { - "text": "?", + "text": "get_boiling_point(liquid_name='polyjuice", "type": "text" }, "event_type": { @@ -54420,61 +41752,52 @@ "data": { "event": { "delta": { - "text": "", + "text": "', celcius=True)]", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, - "metrics": [ - { - "metric": "prompt_tokens", - "unit": null, - "value": 75 - }, - { - "metric": "completion_tokens", - "unit": null, - "value": 25 - }, - { - "metric": "total_tokens", - "unit": null, - "value": 100 - } - ] + "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "celcius": true, + "liquid_name": "polyjuice" + }, + "call_id": "328cb19d-47bb-47cc-8258-a5ca2e26803e", + "tool_name": "get_boiling_point" + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -54485,33 +41808,94 @@ "data": { "event": { "delta": { - "text": "[k", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "dS0bhfN_", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:04:53.324788+00:00", + "__module__": "datetime" + }, + "trace_id": "UJz5Cas1SDyQYeBk", + "type": "metric", + "unit": "tokens", + "value": 37 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "dS0bhfN_", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:04:53.324835+00:00", + "__module__": "datetime" + }, + "trace_id": "UJz5Cas1SDyQYeBk", + "type": "metric", + "unit": "tokens", + "value": 28 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "dS0bhfN_", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:04:53.324844+00:00", + "__module__": "datetime" + }, + "trace_id": "UJz5Cas1SDyQYeBk", + "type": "metric", + "unit": "tokens", + "value": 65 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "nowledge_search", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -54525,7 +41909,7 @@ "data": { "event": { "delta": { - "text": "(query=\"", + "text": "[", "type": "text" }, "event_type": { @@ -54545,7 +41929,7 @@ "data": { "event": { "delta": { - "text": "Torcht", + "text": "get_boiling_point_with_metadata", "type": "text" }, "event_type": { @@ -54565,7 +41949,7 @@ "data": { "event": { "delta": { - "text": "une documentation", + "text": "(liquid_name='polyjuice', cel", "type": "text" }, "event_type": { @@ -54585,7 +41969,7 @@ "data": { "event": { "delta": { - "text": "\")]", + "text": "cius=True)]", "type": "text" }, "event_type": { @@ -54612,10 +41996,11 @@ }, "tool_call": { "arguments": { - "query": "Torchtune documentation" + "celcius": true, + "liquid_name": "polyjuice" }, - "call_id": "70cd0350-8689-4bb5-a0bf-2a9d2112d08d", - "tool_name": "knowledge_search" + "call_id": "5bb48d00-7d5c-49e2-bddf-e5fdc5f35485", + "tool_name": "get_boiling_point_with_metadata" }, "type": "tool_call" }, @@ -54657,19 +42042,55 @@ }, "metrics": [ { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "prompt_tokens", - "unit": null, - "value": 39 + "span_id": "mfrFN7m2", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:05:02.136501+00:00", + "__module__": "datetime" + }, + "trace_id": "T4eddr4-SMWPQwKA", + "type": "metric", + "unit": "tokens", + "value": 37 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "completion_tokens", - "unit": null, - "value": 20 + "span_id": "mfrFN7m2", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:05:02.136529+00:00", + "__module__": "datetime" + }, + "trace_id": "T4eddr4-SMWPQwKA", + "type": "metric", + "unit": "tokens", + "value": 30 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "total_tokens", - "unit": null, - "value": 59 + "span_id": "mfrFN7m2", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:05:02.136535+00:00", + "__module__": "datetime" + }, + "trace_id": "T4eddr4-SMWPQwKA", + "type": "metric", + "unit": "tokens", + "value": 67 } ] } @@ -54677,7 +42098,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Llama3-8B attention type\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:num-1\\nContent: 3 `_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\\nof models across a `range of different benchmarks `_.\\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. In this tutorial we will focus on the 8B size model.\\nThere are a few main changes between Llama2-7B and Llama3-8B models:\\n\\n- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B\\n- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:num-1\\nContent: instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_\\n\\n|\\n\\nGetting access to Llama3-8B-Instruct\\n------------------------------------\\n\\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. First, let's download the model from Hugging Face. You will need to follow the instructions\\non the `official Meta page `_ to gain access to the model.\\nNext, make sure you grab your Hugging Face token from `here `_.\\n\\n\\n.. code-block:: bash\\n\\n tune download meta-llama/Meta-Llama-3\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:num-0\\nContent: :`download Llama3 Instruct weights `\\n\\n\\nTemplate changes from Llama2 to Llama3\\n--------------------------------------\\n\\nThe Llama2 chat model requires a specific template when prompting the pre-trained\\nmodel. Since the chat model was pretrained with this prompt template, if you want to run\\ninference on the model, you'll need to use the same template for optimal performance\\non chat data. Otherwise, the model will just perform standard text completion, which\\nmay or may not align with your intended use case.\\n\\nFrom the `official Llama2 prompt\\ntemplate guide `_\\nfor the Llama2 chat model, we can see that special tags are added:\\n\\n.. code-block:: text\\n\\n [INST] <>\\n You are a helpful, respectful, and honest assistant.\\n <>\\n\\n Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant \\n\\nLlama3 Instruct `overhauled `\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:num-0\\nContent: 'm Meta AI, your friendly AI assistant<|eot_id|>\\n\\nThe tags are entirely different, and they are actually encoded differently than in\\nLlama2. Let's walk through tokenizing an example with the Llama2 template and the\\nLlama3 template to understand how.\\n\\n.. note::\\n The Llama3 Base model uses a `different prompt template\\n `_ than Llama3 Instruct\\n because it has not yet been instruct tuned and the extra special tokens are untrained. If you\\n are running inference on the Llama3 Base model without fine-tuning we recommend the base\\n template for optimal performance. Generally, for instruct and chat data, we recommend using\\n Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\\n Llama3 Instruct.\\n\\n.. _prompt_template_vs_special_tokens:\\n\\nTokenizing prompt templates & special tokens\\n--------------------------------------------\\n\\nLet's say I have a sample of a single user-assistant turn accompanied with a system\\nprompt:\\n\\n.. code-block:: python\\n\\n sample = [\\n {\\n \\\"role\\\": \\\"system\\\",\\n \\\"\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:num-3\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\", and \\\"output_proj\\\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\\\"q_proj\\\", \\\"v_proj\\\"])\\n\\n.. note::\\n\\n Calling :func:`lora_llama_2\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Insert documents into memory\", \"parameters\": {}, \"tool_name\": \"insert_into_memory\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Give me a sentence that contains the word: hello\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": []}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -54705,7 +42126,7 @@ "data": { "event": { "delta": { - "text": "L", + "text": "When", "type": "text" }, "event_type": { @@ -54725,7 +42146,7 @@ "data": { "event": { "delta": { - "text": "lama3", + "text": " I answered the", "type": "text" }, "event_type": { @@ -54745,7 +42166,7 @@ "data": { "event": { "delta": { - "text": "-", + "text": " phone, the friendly", "type": "text" }, "event_type": { @@ -54765,7 +42186,7 @@ "data": { "event": { "delta": { - "text": "8", + "text": " voice on the other end said \"hello\"", "type": "text" }, "event_type": { @@ -54785,7 +42206,7 @@ "data": { "event": { "delta": { - "text": "B uses", + "text": " and asked how I was doing.", "type": "text" }, "event_type": { @@ -54805,7 +42226,108 @@ "data": { "event": { "delta": { - "text": " grouped-query", + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "tJEuRhla", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:01.044284+00:00", + "__module__": "datetime" + }, + "trace_id": "bnDS7Z41TRO0UyfH", + "type": "metric", + "unit": "tokens", + "value": 30 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "tJEuRhla", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:01.044312+00:00", + "__module__": "datetime" + }, + "trace_id": "bnDS7Z41TRO0UyfH", + "type": "metric", + "unit": "tokens", + "value": 34 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "tJEuRhla", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:01.044318+00:00", + "__module__": "datetime" + }, + "trace_id": "bnDS7Z41TRO0UyfH", + "type": "metric", + "unit": "tokens", + "value": 64 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\\n# Sample of data\\nprint(\\\"Data sample from file:\\\")\\nprint(df.head())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\\n# Sample of data\\nprint(\\\"Data sample from file:\\\")\\nprint(df.head())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "I", "type": "text" }, "event_type": { @@ -54825,7 +42347,7 @@ "data": { "event": { "delta": { - "text": " attention instead", + "text": " am not able", "type": "text" }, "event_type": { @@ -54845,7 +42367,7 @@ "data": { "event": { "delta": { - "text": " of the", + "text": " to execute this task as", "type": "text" }, "event_type": { @@ -54865,7 +42387,7 @@ "data": { "event": { "delta": { - "text": " standard", + "text": " it exceeds the", "type": "text" }, "event_type": { @@ -54885,7 +42407,7 @@ "data": { "event": { "delta": { - "text": " multi-head", + "text": " limitations of the functions I", "type": "text" }, "event_type": { @@ -54905,7 +42427,7 @@ "data": { "event": { "delta": { - "text": " attention.", + "text": " have been given.", "type": "text" }, "event_type": { @@ -54942,19 +42464,55 @@ }, "metrics": [ { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "prompt_tokens", - "unit": null, - "value": 80 + "span_id": "5If5go-q", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:48.070675+00:00", + "__module__": "datetime" + }, + "trace_id": "StUjhrTMQKKQSRvS", + "type": "metric", + "unit": "tokens", + "value": 433 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "completion_tokens", - "unit": null, - "value": 28 + "span_id": "5If5go-q", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:48.070742+00:00", + "__module__": "datetime" + }, + "trace_id": "StUjhrTMQKKQSRvS", + "type": "metric", + "unit": "tokens", + "value": 31 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "total_tokens", - "unit": null, - "value": 108 + "span_id": "5If5go-q", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:48.070750+00:00", + "__module__": "datetime" + }, + "trace_id": "StUjhrTMQKKQSRvS", + "type": "metric", + "unit": "tokens", + "value": 464 } ] } @@ -54962,7 +42520,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Llama3-8B attention type\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:num-1\\nContent: 3 `_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\\nof models across a `range of different benchmarks `_.\\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. In this tutorial we will focus on the 8B size model.\\nThere are a few main changes between Llama2-7B and Llama3-8B models:\\n\\n- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B\\n- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:num-1\\nContent: instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_\\n\\n|\\n\\nGetting access to Llama3-8B-Instruct\\n------------------------------------\\n\\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. First, let's download the model from Hugging Face. You will need to follow the instructions\\non the `official Meta page `_ to gain access to the model.\\nNext, make sure you grab your Hugging Face token from `here `_.\\n\\n\\n.. code-block:: bash\\n\\n tune download meta-llama/Meta-Llama-3\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:num-0\\nContent: :`download Llama3 Instruct weights `\\n\\n\\nTemplate changes from Llama2 to Llama3\\n--------------------------------------\\n\\nThe Llama2 chat model requires a specific template when prompting the pre-trained\\nmodel. Since the chat model was pretrained with this prompt template, if you want to run\\ninference on the model, you'll need to use the same template for optimal performance\\non chat data. Otherwise, the model will just perform standard text completion, which\\nmay or may not align with your intended use case.\\n\\nFrom the `official Llama2 prompt\\ntemplate guide `_\\nfor the Llama2 chat model, we can see that special tags are added:\\n\\n.. code-block:: text\\n\\n [INST] <>\\n You are a helpful, respectful, and honest assistant.\\n <>\\n\\n Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant \\n\\nLlama3 Instruct `overhauled `\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:num-0\\nContent: 'm Meta AI, your friendly AI assistant<|eot_id|>\\n\\nThe tags are entirely different, and they are actually encoded differently than in\\nLlama2. Let's walk through tokenizing an example with the Llama2 template and the\\nLlama3 template to understand how.\\n\\n.. note::\\n The Llama3 Base model uses a `different prompt template\\n `_ than Llama3 Instruct\\n because it has not yet been instruct tuned and the extra special tokens are untrained. If you\\n are running inference on the Llama3 Base model without fine-tuning we recommend the base\\n template for optimal performance. Generally, for instruct and chat data, we recommend using\\n Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\\n Llama3 Instruct.\\n\\n.. _prompt_template_vs_special_tokens:\\n\\nTokenizing prompt templates & special tokens\\n--------------------------------------------\\n\\nLet's say I have a sample of a single user-assistant turn accompanied with a system\\nprompt:\\n\\n.. code-block:: python\\n\\n sample = [\\n {\\n \\\"role\\\": \\\"system\\\",\\n \\\"\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:num-3\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\", and \\\"output_proj\\\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\\\"q_proj\\\", \\\"v_proj\\\"])\\n\\n.. note::\\n\\n Calling :func:`lora_llama_2\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\\n# Sample of data\\nprint(\\\"Data sample from file:\\\")\\nprint(df.head())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -54990,8 +42548,13 @@ "data": { "event": { "delta": { - "text": "L", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -55010,8 +42573,13 @@ "data": { "event": { "delta": { - "text": "lama3", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\n# Load data\ndf =", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -55030,8 +42598,13 @@ "data": { "event": { "delta": { - "text": "-8", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " pd.read_csv(\"/var/folders/rb/qv8vwgyj", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -55050,8 +42623,13 @@ "data": { "event": { "delta": { - "text": "B uses", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "6yjd3t4pwsy9t0rm0000", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -55070,8 +42648,13 @@ "data": { "event": { "delta": { - "text": " grouped", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "gn/T/tmp2x_sml66/ZEjbinQHin", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -55090,8 +42673,13 @@ "data": { "event": { "delta": { - "text": "-query attention", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "flation.csv\")\n# Rows\nprint(\"Number of rows and columns in the", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -55110,8 +42698,13 @@ "data": { "event": { "delta": { - "text": " instead of", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " data:\", df.shape)\n# Columns\nprint(\"Columns of the data are:\",", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -55130,8 +42723,13 @@ "data": { "event": { "delta": { - "text": " the standard", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " len(df.columns))\n# Column names\nprint(\"Columns of the data", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -55150,8 +42748,13 @@ "data": { "event": { "delta": { - "text": " multi", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " are:\", df.columns)\n# Column dtypes\nprint(\"Datatype of the columns are:\", df", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -55170,8 +42773,13 @@ "data": { "event": { "delta": { - "text": "-head attention", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".dtypes)\n# Sample of data\nprint(\"Data sample from file:\")\n", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -55190,8 +42798,13 @@ "data": { "event": { "delta": { - "text": ".", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "print(df.head())", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -55204,6 +42817,45 @@ "metrics": null } }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\n# Load data\ndf = pd.read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4pwsy9t0rm0000gn/T/tmp2x_sml66/ZEjbinQHinflation.csv\")\n# Rows\nprint(\"Number of rows and columns in the data:\", df.shape)\n# Columns\nprint(\"Columns of the data are:\", len(df.columns))\n# Column names\nprint(\"Columns of the data are:\", df.columns)\n# Column dtypes\nprint(\"Datatype of the columns are:\", df.dtypes)\n# Sample of data\nprint(\"Data sample from file:\")\nprint(df.head())" + }, + "call_id": "1df8b196-9eff-4b06-97e7-ab175c741e8f", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", @@ -55227,19 +42879,55 @@ }, "metrics": [ { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "prompt_tokens", - "unit": null, - "value": 80 + "span_id": "fLqIbpek", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:40.262304+00:00", + "__module__": "datetime" + }, + "trace_id": "StUjhrTMQKKQSRvS", + "type": "metric", + "unit": "tokens", + "value": 235 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "completion_tokens", - "unit": null, - "value": 28 + "span_id": "fLqIbpek", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:40.262340+00:00", + "__module__": "datetime" + }, + "trace_id": "StUjhrTMQKKQSRvS", + "type": "metric", + "unit": "tokens", + "value": 10 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "total_tokens", - "unit": null, - "value": 108 + "span_id": "fLqIbpek", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:40.262347+00:00", + "__module__": "datetime" + }, + "trace_id": "StUjhrTMQKKQSRvS", + "type": "metric", + "unit": "tokens", + "value": 245 } ] } @@ -55247,7 +42935,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Insert documents into memory\", \"parameters\": {}, \"tool_name\": \"insert_into_memory\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -55275,8 +42963,13 @@ "data": { "event": { "delta": { - "text": "[k", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -55295,8 +42988,13 @@ "data": { "event": { "delta": { - "text": "nowledge_search", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\n# Load data\ndf = pd", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -55315,8 +43013,13 @@ "data": { "event": { "delta": { - "text": "(query=\"", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -55335,8 +43038,13 @@ "data": { "event": { "delta": { - "text": "Llama", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "pwsy9t0rm0000gn/T/tmp2x_sml66/ZEj", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -55355,8 +43063,13 @@ "data": { "event": { "delta": { - "text": "3-", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "binQHinflation.csv\")\n# Rows\nprint(\"Number of rows and columns in the data", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -55375,8 +43088,13 @@ "data": { "event": { "delta": { - "text": "8B", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ":\", df.shape)\n# Columns\nprint(\"Columns of the data are:\", len(df.columns))\n#", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -55395,8 +43113,13 @@ "data": { "event": { "delta": { - "text": " attention type", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " Column names\nprint(\"Columns of the data are:\", df.columns)\n# Column dtypes\nprint", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -55415,8 +43138,38 @@ "data": { "event": { "delta": { - "text": "\")]", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "(\"Datatype of the columns are:\", df.dtypes)\n# Sample", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " of data\nprint(\"Data sample from file:\")\nprint(df.head())", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -55442,10 +43195,14 @@ }, "tool_call": { "arguments": { - "query": "Llama3-8B attention type" + "code": "import pandas as pd\n# Load data\ndf = pd.read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4pwsy9t0rm0000gn/T/tmp2x_sml66/ZEjbinQHinflation.csv\")\n# Rows\nprint(\"Number of rows and columns in the data:\", df.shape)\n# Columns\nprint(\"Columns of the data are:\", len(df.columns))\n# Column names\nprint(\"Columns of the data are:\", df.columns)\n# Column dtypes\nprint(\"Datatype of the columns are:\", df.dtypes)\n# Sample of data\nprint(\"Data sample from file:\")\nprint(df.head())" }, - "call_id": "d7f3056e-3c5c-4bbc-869e-b617a35bdbb4", - "tool_name": "knowledge_search" + "call_id": "c1708ded-f272-4008-b91f-19d61780c394", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } }, "type": "tool_call" }, @@ -55487,19 +43244,55 @@ }, "metrics": [ { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "prompt_tokens", - "unit": null, - "value": 40 + "span_id": "KTMayjIE", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:37.305765+00:00", + "__module__": "datetime" + }, + "trace_id": "StUjhrTMQKKQSRvS", + "type": "metric", + "unit": "tokens", + "value": 37 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "completion_tokens", - "unit": null, - "value": 24 + "span_id": "KTMayjIE", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:37.305820+00:00", + "__module__": "datetime" + }, + "trace_id": "StUjhrTMQKKQSRvS", + "type": "metric", + "unit": "tokens", + "value": 10 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "total_tokens", - "unit": null, - "value": 64 + "span_id": "KTMayjIE", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:37.305832+00:00", + "__module__": "datetime" + }, + "trace_id": "StUjhrTMQKKQSRvS", + "type": "metric", + "unit": "tokens", + "value": 47 } ] } @@ -55507,7 +43300,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics about the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"The error message indicates that the file \\\"\" does not exist. This could be due to a variety of reasons such as the file being deleted, the path being incorrect, or the file not being accessible.\\n\\nTo resolve this issue, you can try the following:\\n\\n1. Check the file path: Ensure that the file path is correct and the file exists at that location.\\n2. Check file permissions: Ensure that the file is accessible and you have the necessary permissions to read it.\\n3. Try a different file: If the file is not accessible, try loading a different file to see if the issue is specific to this file or a general issue with your code.\\n4. Check for typos: Ensure that there are no typos in the file path or the code.\\n\\nIf you are still having issues, please provide more details about the file and the code you are using, and I'll be happy to help further.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Convert the 'Year' column to datetime\\ndf['Year'] = pd.to_datetime(df['Year'], format='%Y')\\n\\n# Group by 'Year' and calculate the average inflation\\ndf_avg_inflation = df.groupby('Year')['Inflation'].mean().reset_index()\\n\\n# Plot the average yearly inflation as a time series\\nplt.figure(figsize=(10,6))\\nplt.plot(df_avg_inflation['Year'], df_avg_inflation['Inflation'], marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Inflation')\\nplt.grid(True)\\nplt.show()\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -55535,7 +43328,7 @@ "data": { "event": { "delta": { - "text": "[k", + "text": "The", "type": "text" }, "event_type": { @@ -55555,7 +43348,7 @@ "data": { "event": { "delta": { - "text": "nowledge_search", + "text": " error message indicates that the file \"/var/folders/rb/qv8", "type": "text" }, "event_type": { @@ -55575,7 +43368,7 @@ "data": { "event": { "delta": { - "text": "(query=\"", + "text": "vwgyj6yjd3t4pwsy9t0", "type": "text" }, "event_type": { @@ -55595,7 +43388,7 @@ "data": { "event": { "delta": { - "text": "L", + "text": "rm0000gn/T/tmp2x_sml66/9vY", "type": "text" }, "event_type": { @@ -55615,7 +43408,7 @@ "data": { "event": { "delta": { - "text": "lama3", + "text": "vmVRoinflation.csv\" does not exist. This could be due to", "type": "text" }, "event_type": { @@ -55635,7 +43428,7 @@ "data": { "event": { "delta": { - "text": "-8", + "text": " a variety of reasons such as the file being deleted, the path being incorrect", "type": "text" }, "event_type": { @@ -55655,7 +43448,7 @@ "data": { "event": { "delta": { - "text": "B attention", + "text": ", or the file not being accessible.\n\nTo resolve this issue, you can", "type": "text" }, "event_type": { @@ -55675,7 +43468,7 @@ "data": { "event": { "delta": { - "text": " type\")]", + "text": " try the following:\n\n1. Check the file path: Ensure that the file", "type": "text" }, "event_type": { @@ -55695,93 +43488,13 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "Llama3-8B attention type" - }, - "call_id": "cd89b59f-7caa-4669-85c8-df6ba3892e77", - "tool_name": "knowledge_search" - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "metric": "prompt_tokens", - "unit": null, - "value": 40 - }, - { - "metric": "completion_tokens", - "unit": null, - "value": 24 - }, - { - "metric": "total_tokens", - "unit": null, - "value": 64 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the current CEO of Meta is.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"current CEO of Meta\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"{\\\"query\\\": \\\"current CEO of Meta\\\", \\\"top_k\\\": [{\\\"title\\\": \\\"Executives - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer Joel Kaplan, Chief Global Affairs Officer Susan Li, Chief Financial Officer Javier Olivan, Chief Operating Officer Chris Cox, Chief Product Officer Andrew \\\\u2018Boz\\\\u2019 Bosworth, Chief Technology Officer Jennifer Newstead, Chief Legal Officer Dave Wehner, Chief Strategy Officer Will Cathcart, Head of WhatsApp Naomi Gleit, Head of Product John Hegeman, Chief Revenue Officer Adam Mosseri, Head of Instagram Erin Egan, Chief Privacy Officer, Policy Michel Protti, Chief Privacy Officer, Product Alex Schultz, Chief Marketing Officer and VP of Analytics Tom Alison, Head of Facebook Nicola Mendelsohn, Head of Global Business Group Ahmad Al-Dahle, VP and Head of GenAI at Meta Joelle Pineau, Vice President of AI Research and Head of FAIR at Meta\\\", \\\"score\\\": 0.8190992, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\\\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\\\u00a9 2025 Meta\\\", \\\"score\\\": 0.79080546, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Meet the Executive CSuite Team of Meta (Facebook) [2025]\\\", \\\"url\\\": \\\"https://digitaldefynd.com/IQ/meet-the-executive-csuite-team-of-meta-facebook/\\\", \\\"content\\\": \\\"Harvard University Executive Programs Free Harvard University Courses As a chief financial officer of Meta, Susan Li oversees the firm\\\\u2019s finance and facilities team to keep track of the company\\\\u2019s overall financial health. The chief operating officer of Meta, Javier Olivan, oversees the firm\\\\u2019s business team, infrastructure, and other products. Andrew Bosworth, called Boz, serves as chief technology officer at Meta and is responsible for leading the firm\\\\u2019s AR/VR organization, Reality Labs. Andrew has also served as engineering director to oversee events, mobile monetization, and feed ads and as VP of ads and business platforms to lead engineering, design, analytics, and product teams. Meta\\\\u2019s c-suite team comprises experienced and diverse executives, having extensive experience in technology, finance, legal, and all major industries.\\\", \\\"score\\\": 0.7602419, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Meta - Leadership & Governance\\\", \\\"url\\\": \\\"https://investor.atmeta.com/leadership-and-governance/\\\", \\\"content\\\": \\\"Mr. Andreessen was a co-founder of Netscape Communications Corporation, a software company, serving in various positions, including Chief Technology Officer and Executive Vice President of Products. Ms. Killefer also served as Assistant Secretary for Management, Chief Financial Officer, and Chief Operating Officer of the U.S. Department of the Treasury from 1997 to 2000 and as a member of the IRS Oversight Board from 2000 to 2005, including as Chair of the IRS Oversight Board from 2002 to 2004. Ms. Travis has served as Executive Vice President and Chief Financial Officer of The Estee Lauder Companies Inc., a global manufacturer and marketer of skin care, makeup, fragrance and hair care products, since August 2012.\\\", \\\"score\\\": 0.6175132, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg - Wikipedia\\\", \\\"url\\\": \\\"https://en.wikipedia.org/wiki/Mark_Zuckerberg\\\", \\\"content\\\": \\\"They began dating in 2003.[175] In September 2010, Chan, who was a medical student at the University of California, San Francisco at the time,[176] moved into his rented house in Palo Alto, California.[177][178] They married on May 19, 2012, in the grounds of his mansion in an event that also celebrated her graduation from medical school.[179][180] Zuckerberg revealed in July 2015 that they were expecting a baby girl and that Chan had previously experienced three miscarriages.[181] Their first daughter was born in December 2015.[182] They announced in a Chinese New Year video that their daughter's Chinese name is Chen Mingyu (Chinese: \\\\u9648\\\\u660e\\\\u5b87).[183] Their second daughter was born in August 2017.[184] Zuckerberg and his wife welcomed their third daughter in March 2023 and announced the news across his social media pages.[185] The couple also have a Puli dog named Beast,[186] who has over two million followers on Facebook.[187] Zuckerberg commissioned the visual artist Daniel Arsham to build a 7-foot-tall sculpture of his wife, which was unveiled in 2024.[188]\\\", \\\"score\\\": 0.05570498, \\\"raw_content\\\": null}]}\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", + "text": " path is correct and the file exists at that location.\n2. Check file permissions:", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -55795,7 +43508,7 @@ "data": { "event": { "delta": { - "text": "The", + "text": " Ensure that the file is accessible and you have the necessary permissions to", "type": "text" }, "event_type": { @@ -55815,7 +43528,7 @@ "data": { "event": { "delta": { - "text": " current CEO of Meta is", + "text": " read it.\n3. Try a different file: If the file is not", "type": "text" }, "event_type": { @@ -55835,7 +43548,7 @@ "data": { "event": { "delta": { - "text": " Mark", + "text": " accessible, try loading a different file to see if the issue is specific to", "type": "text" }, "event_type": { @@ -55855,7 +43568,7 @@ "data": { "event": { "delta": { - "text": " Zuckerberg.", + "text": " this file or a general issue with your code.\n4. Check for ty", "type": "text" }, "event_type": { @@ -55875,58 +43588,13 @@ "data": { "event": { "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "metric": "prompt_tokens", - "unit": null, - "value": 1235 - }, - { - "metric": "completion_tokens", - "unit": null, - "value": 19 - }, - { - "metric": "total_tokens", - "unit": null, - "value": 1254 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the current CEO of Meta is.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"current CEO of Meta\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"{\\\"query\\\": \\\"current CEO of Meta\\\", \\\"top_k\\\": [{\\\"title\\\": \\\"Meta - Leadership & Governance\\\", \\\"url\\\": \\\"https://investor.atmeta.com/leadership-and-governance/\\\", \\\"content\\\": \\\"Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. Mark is responsible for setting the overall direction and product strategy for the company. He leads the design of Meta's services and development of its core technology and infrastructure. Mark studied computer science at Harvard\\\", \\\"score\\\": 0.8342047, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Executives - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer Joel Kaplan, Chief Global Affairs Officer Susan Li, Chief Financial Officer Javier Olivan, Chief Operating Officer Chris Cox, Chief Product Officer Andrew \\\\u2018Boz\\\\u2019 Bosworth, Chief Technology Officer Jennifer Newstead, Chief Legal Officer Dave Wehner, Chief Strategy Officer Will Cathcart, Head of WhatsApp Naomi Gleit, Head of Product John Hegeman, Chief Revenue Officer Adam Mosseri, Head of Instagram Erin Egan, Chief Privacy Officer, Policy Michel Protti, Chief Privacy Officer, Product Alex Schultz, Chief Marketing Officer and VP of Analytics Tom Alison, Head of Facebook Nicola Mendelsohn, Head of Global Business Group Ahmad Al-Dahle, VP and Head of GenAI at Meta Joelle Pineau, Vice President of AI Research and Head of FAIR at Meta\\\", \\\"score\\\": 0.8190992, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\\\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\\\u00a9 2025 Meta\\\", \\\"score\\\": 0.79099923, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Meet the Executive CSuite Team of Meta (Facebook) [2025]\\\", \\\"url\\\": \\\"https://digitaldefynd.com/IQ/meet-the-executive-csuite-team-of-meta-facebook/\\\", \\\"content\\\": \\\"Harvard University Executive Programs Free Harvard University Courses As a chief financial officer of Meta, Susan Li oversees the firm\\\\u2019s finance and facilities team to keep track of the company\\\\u2019s overall financial health. The chief operating officer of Meta, Javier Olivan, oversees the firm\\\\u2019s business team, infrastructure, and other products. Andrew Bosworth, called Boz, serves as chief technology officer at Meta and is responsible for leading the firm\\\\u2019s AR/VR organization, Reality Labs. Andrew has also served as engineering director to oversee events, mobile monetization, and feed ads and as VP of ads and business platforms to lead engineering, design, analytics, and product teams. Meta\\\\u2019s c-suite team comprises experienced and diverse executives, having extensive experience in technology, finance, legal, and all major industries.\\\", \\\"score\\\": 0.7602419, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg - Wikipedia\\\", \\\"url\\\": \\\"https://en.wikipedia.org/wiki/Mark_Zuckerberg\\\", \\\"content\\\": \\\"They began dating in 2003.[175] In September 2010, Chan, who was a medical student at the University of California, San Francisco at the time,[176] moved into his rented house in Palo Alto, California.[177][178] They married on May 19, 2012, in the grounds of his mansion in an event that also celebrated her graduation from medical school.[179][180] Zuckerberg revealed in July 2015 that they were expecting a baby girl and that Chan had previously experienced three miscarriages.[181] Their first daughter was born in December 2015.[182] They announced in a Chinese New Year video that their daughter's Chinese name is Chen Mingyu (Chinese: \\\\u9648\\\\u660e\\\\u5b87).[183] Their second daughter was born in August 2017.[184] Zuckerberg and his wife welcomed their third daughter in March 2023 and announced the news across his social media pages.[185] The couple also have a Puli dog named Beast,[186] who has over two million followers on Facebook.[187] Zuckerberg commissioned the visual artist Daniel Arsham to build a 7-foot-tall sculpture of his wife, which was unveiled in 2024.[188]\\\", \\\"score\\\": 0.05564338, \\\"raw_content\\\": null}]}\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", + "text": "pos: Ensure that there are no typos in the file path or the", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -55940,7 +43608,7 @@ "data": { "event": { "delta": { - "text": "The", + "text": " code.\n\nIf you are still having issues, please provide more details about the file and the code", "type": "text" }, "event_type": { @@ -55960,7 +43628,7 @@ "data": { "event": { "delta": { - "text": " current CEO of Meta is Mark Zuckerberg.", + "text": " you are using, and I'll be happy to help further.", "type": "text" }, "event_type": { @@ -56002,16 +43670,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "oB7hDf6E", + "span_id": "f28sT2i7", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:07.084924+00:00", + "__datetime__": "2025-03-07T01:44:23.262530+00:00", "__module__": "datetime" }, - "trace_id": "hwA8OLUhQ1qa3ecF", + "trace_id": "8YKzpfybSiGgrHOF", "type": "metric", "unit": "tokens", - "value": 1145 + "value": 680 }, { "attributes": { @@ -56019,16 +43687,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "oB7hDf6E", + "span_id": "f28sT2i7", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:07.084934+00:00", + "__datetime__": "2025-03-07T01:44:23.262555+00:00", "__module__": "datetime" }, - "trace_id": "hwA8OLUhQ1qa3ecF", + "trace_id": "8YKzpfybSiGgrHOF", "type": "metric", "unit": "tokens", - "value": 19 + "value": 238 }, { "attributes": { @@ -56036,16 +43704,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "oB7hDf6E", + "span_id": "f28sT2i7", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:07.084936+00:00", + "__datetime__": "2025-03-07T01:44:23.262558+00:00", "__module__": "datetime" }, - "trace_id": "hwA8OLUhQ1qa3ecF", + "trace_id": "8YKzpfybSiGgrHOF", "type": "metric", "unit": "tokens", - "value": 1164 + "value": 918 } ] } @@ -56053,7 +43721,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the current CEO of Meta is.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics about the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"The error message indicates that the file \\\"\" does not exist. This could be due to a variety of reasons such as the file being deleted, the path being incorrect, or the file not being accessible.\\n\\nTo resolve this issue, you can try the following:\\n\\n1. Check the file path: Ensure that the file path is correct and the file exists at that location.\\n2. Check file permissions: Ensure that the file is accessible and you have the necessary permissions to read it.\\n3. Try a different file: If the file is not accessible, try loading a different file to see if the issue is specific to this file or a general issue with your code.\\n4. Check for typos: Ensure that there are no typos in the file path or the code.\\n\\nIf you are still having issues, please provide more details about the file and the code you are using, and I'll be happy to help further.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -56111,7 +43779,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "brave", + "tool_call": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load the CSV", "type": "tool_call" }, "event_type": { @@ -56136,7 +43804,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "_search.call", + "tool_call": " file\ndf = pd.read_csv(\"/var/folders/rb/qv", "type": "tool_call" }, "event_type": { @@ -56161,7 +43829,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "(query=\"", + "tool_call": "8vwgyj6yjd3t4pwsy9t", "type": "tool_call" }, "event_type": { @@ -56186,7 +43854,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "current", + "tool_call": "0rm0000gn/T/tmp2x_sml66/9v", "type": "tool_call" }, "event_type": { @@ -56211,7 +43879,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " CEO of", + "tool_call": "YvmVRoinflation.csv\")\n\n# Convert the 'Year'", "type": "tool_call" }, "event_type": { @@ -56236,7 +43904,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " Meta\")", + "tool_call": " column to datetime\ndf['Year'] = pd.to_datetime(df['Year", "type": "tool_call" }, "event_type": { @@ -56259,19 +43927,9 @@ "parse_status": { "__enum__": "ToolCallParseStatus", "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "current CEO of Meta" - }, - "call_id": "112fe886-dc25-4347-9b54-b52571b0cdb5", - "tool_name": { - "__enum__": "BuiltinTool", - "__module__": "llama_stack.models.llama.datatypes", - "value": "brave_search" - } + "value": "in_progress" }, + "tool_call": "'], format='%Y')\n\n# Group by", "type": "tool_call" }, "event_type": { @@ -56280,11 +43938,7 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } @@ -56295,58 +43949,18 @@ "data": { "event": { "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "metric": "prompt_tokens", - "unit": null, - "value": 34 - }, - { - "metric": "completion_tokens", - "unit": null, - "value": 10 - }, - { - "metric": "total_tokens", - "unit": null, - "value": 44 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": \"get_boiling_point\", \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " 'Year' and calculate the average inflation\ndf_avg_in", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -56360,8 +43974,13 @@ "data": { "event": { "delta": { - "text": "The", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "flation = df.groupby('Year')['Inflation'].mean().reset_index()\n\n", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -56380,8 +43999,13 @@ "data": { "event": { "delta": { - "text": " boiling point", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "# Plot the average yearly inflation as a time series\n", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -56400,8 +44024,13 @@ "data": { "event": { "delta": { - "text": " of poly", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "plt.figure(figsize=(10,6))\nplt.plot(df_avg_inflation['", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -56420,8 +44049,13 @@ "data": { "event": { "delta": { - "text": "ju", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "Year'], df_avg_in", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -56440,8 +44074,13 @@ "data": { "event": { "delta": { - "text": "ice", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "flation['Inflation'], marker='o')\nplt", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -56460,8 +44099,13 @@ "data": { "event": { "delta": { - "text": " is -", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -56480,8 +44124,13 @@ "data": { "event": { "delta": { - "text": "100 degrees", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "('Inflation')\nplt.grid(True)\nplt.show()", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -56500,8 +44149,23 @@ "data": { "event": { "delta": { - "text": " Celsius.", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4pwsy9t0rm0000gn/T/tmp2x_sml66/9vYvmVRoinflation.csv\")\n\n# Convert the 'Year' column to datetime\ndf['Year'] = pd.to_datetime(df['Year'], format='%Y')\n\n# Group by 'Year' and calculate the average inflation\ndf_avg_inflation = df.groupby('Year')['Inflation'].mean().reset_index()\n\n# Plot the average yearly inflation as a time series\nplt.figure(figsize=(10,6))\nplt.plot(df_avg_inflation['Year'], df_avg_inflation['Inflation'], marker='o')\nplt.title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel('Inflation')\nplt.grid(True)\nplt.show()" + }, + "call_id": "f4efa2d4-e4e7-4ea1-8c5e-6a78bec5816f", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -56509,7 +44173,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -56537,19 +44205,55 @@ }, "metrics": [ { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "prompt_tokens", - "unit": null, - "value": 77 + "span_id": "qQY5sAli", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:21.953806+00:00", + "__module__": "datetime" + }, + "trace_id": "8YKzpfybSiGgrHOF", + "type": "metric", + "unit": "tokens", + "value": 432 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "completion_tokens", - "unit": null, - "value": 23 + "span_id": "qQY5sAli", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:21.953843+00:00", + "__module__": "datetime" + }, + "trace_id": "8YKzpfybSiGgrHOF", + "type": "metric", + "unit": "tokens", + "value": 10 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "total_tokens", - "unit": null, - "value": 100 + "span_id": "qQY5sAli", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:21.953847+00:00", + "__module__": "datetime" + }, + "trace_id": "8YKzpfybSiGgrHOF", + "type": "metric", + "unit": "tokens", + "value": 442 } ] } @@ -56557,7 +44261,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics about the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -56605,7 +44309,7 @@ "data": { "event": { "delta": { - "text": " boiling point", + "text": " error message indicates that the file \"/var/folders/rb/qv8vwgyj6y", "type": "text" }, "event_type": { @@ -56625,7 +44329,7 @@ "data": { "event": { "delta": { - "text": " of poly", + "text": "jd3t4pwsy9t0rm0000gn/T/tmp2x_sml", "type": "text" }, "event_type": { @@ -56645,7 +44349,7 @@ "data": { "event": { "delta": { - "text": "juice", + "text": "66/9vYvmVRoinflation.csv\" does not exist. This could be", "type": "text" }, "event_type": { @@ -56665,7 +44369,7 @@ "data": { "event": { "delta": { - "text": " is -", + "text": " due to a variety of", "type": "text" }, "event_type": { @@ -56685,7 +44389,7 @@ "data": { "event": { "delta": { - "text": "100 degrees", + "text": " reasons such as the file", "type": "text" }, "event_type": { @@ -56705,7 +44409,7 @@ "data": { "event": { "delta": { - "text": " Celsius.", + "text": " being deleted, the path being incorrect, or the file", "type": "text" }, "event_type": { @@ -56725,58 +44429,13 @@ "data": { "event": { "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "metric": "prompt_tokens", - "unit": null, - "value": 77 - }, - { - "metric": "completion_tokens", - "unit": null, - "value": 23 - }, - { - "metric": "total_tokens", - "unit": null, - "value": 100 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"required\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", + "text": " not being accessible.\n\nTo resolve this issue, you can try", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -56790,7 +44449,7 @@ "data": { "event": { "delta": { - "text": "The", + "text": " the following:\n\n1. Check the file path: Ensure that", "type": "text" }, "event_type": { @@ -56810,7 +44469,7 @@ "data": { "event": { "delta": { - "text": " boiling point", + "text": " the file path is correct and the file exists at that", "type": "text" }, "event_type": { @@ -56830,7 +44489,7 @@ "data": { "event": { "delta": { - "text": " of poly", + "text": " location.\n2. Check file permissions: Ensure that", "type": "text" }, "event_type": { @@ -56850,7 +44509,7 @@ "data": { "event": { "delta": { - "text": "juice", + "text": " the file is accessible and", "type": "text" }, "event_type": { @@ -56870,7 +44529,7 @@ "data": { "event": { "delta": { - "text": " is", + "text": " you have the necessary permissions to read", "type": "text" }, "event_type": { @@ -56890,7 +44549,7 @@ "data": { "event": { "delta": { - "text": " -", + "text": " it.\n3. Try a different file: If", "type": "text" }, "event_type": { @@ -56910,7 +44569,7 @@ "data": { "event": { "delta": { - "text": "100 degrees", + "text": " the file is not accessible, try loading a different file to see", "type": "text" }, "event_type": { @@ -56930,7 +44589,7 @@ "data": { "event": { "delta": { - "text": " Celsius.", + "text": " if the issue is specific to this file or a general", "type": "text" }, "event_type": { @@ -56950,58 +44609,13 @@ "data": { "event": { "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "metric": "prompt_tokens", - "unit": null, - "value": 77 - }, - { - "metric": "completion_tokens", - "unit": null, - "value": 23 - }, - { - "metric": "total_tokens", - "unit": null, - "value": 100 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": \"get_boiling_point\", \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", + "text": " issue with your code.\n", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -57015,7 +44629,7 @@ "data": { "event": { "delta": { - "text": "The", + "text": "4. Check for typos: Ensure that", "type": "text" }, "event_type": { @@ -57035,7 +44649,7 @@ "data": { "event": { "delta": { - "text": " function call should be", + "text": " there are no typos in the file path or the code.\n\n", "type": "text" }, "event_type": { @@ -57055,7 +44669,7 @@ "data": { "event": { "delta": { - "text": ":\n[", + "text": "If you are", "type": "text" }, "event_type": { @@ -57075,7 +44689,7 @@ "data": { "event": { "delta": { - "text": "get", + "text": " still having issues, please provide more details about", "type": "text" }, "event_type": { @@ -57095,7 +44709,7 @@ "data": { "event": { "delta": { - "text": "_boiling_point(liquid_name='polyjuice', celci", + "text": " the file and the code you are using", "type": "text" }, "event_type": { @@ -57115,7 +44729,7 @@ "data": { "event": { "delta": { - "text": "us=True)]", + "text": ", and I'll be happy to help further.", "type": "text" }, "event_type": { @@ -57157,16 +44771,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "JN7UZs_c", + "span_id": "KwfNrQLy", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:42.473221+00:00", + "__datetime__": "2025-03-07T01:44:19.630894+00:00", "__module__": "datetime" }, - "trace_id": "H3r-_Zh-TVqtSp7k", + "trace_id": "kNsljyzfQV2Cn4aZ", "type": "metric", "unit": "tokens", - "value": 86 + "value": 192 }, { "attributes": { @@ -57174,16 +44788,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "JN7UZs_c", + "span_id": "KwfNrQLy", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:42.473254+00:00", + "__datetime__": "2025-03-07T01:44:19.630987+00:00", "__module__": "datetime" }, - "trace_id": "H3r-_Zh-TVqtSp7k", + "trace_id": "kNsljyzfQV2Cn4aZ", "type": "metric", "unit": "tokens", - "value": 34 + "value": 238 }, { "attributes": { @@ -57191,16 +44805,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "JN7UZs_c", + "span_id": "KwfNrQLy", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:42.473261+00:00", + "__datetime__": "2025-03-07T01:44:19.630996+00:00", "__module__": "datetime" }, - "trace_id": "H3r-_Zh-TVqtSp7k", + "trace_id": "kNsljyzfQV2Cn4aZ", "type": "metric", "unit": "tokens", - "value": 120 + "value": 430 } ] } @@ -57208,7 +44822,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -57236,8 +44850,13 @@ "data": { "event": { "delta": { - "text": "The", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -57256,8 +44875,13 @@ "data": { "event": { "delta": { - "text": " function `get_boiling_point`", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\n\n# Load the CSV file\ndf = pd.read", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -57276,8 +44900,13 @@ "data": { "event": { "delta": { - "text": " is not a real function and cannot be", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "_csv(\"/var/folders/rb/qv8vwgyj6y", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -57296,8 +44925,13 @@ "data": { "event": { "delta": { - "text": " used to determine the boiling point of polyju", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "jd3t4pwsy9t0rm0000gn/T", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -57316,8 +44950,13 @@ "data": { "event": { "delta": { - "text": "ice. Polyjuice is a fictional substance from the", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "/tmp2x_sml66/9vYvmVRoinflation.csv", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -57336,8 +44975,13 @@ "data": { "event": { "delta": { - "text": " Harry Potter series and does not have a real-world boiling", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\")\n\n# Print the first few rows of the dataframe\nprint(df.head())\n\n", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -57356,8 +45000,13 @@ "data": { "event": { "delta": { - "text": " point. If you have any other questions or need help", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "# Print information about the dataframe\nprint(df", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -57376,8 +45025,13 @@ "data": { "event": { "delta": { - "text": " with a different topic, feel free to ask!", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".info())\n\n# Print summary statistics about the dataframe\nprint(df.describe", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -57390,6 +45044,70 @@ "metrics": null } }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "())", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4pwsy9t0rm0000gn/T/tmp2x_sml66/9vYvmVRoinflation.csv\")\n\n# Print the first few rows of the dataframe\nprint(df.head())\n\n# Print information about the dataframe\nprint(df.info())\n\n# Print summary statistics about the dataframe\nprint(df.describe())" + }, + "call_id": "5bbfebeb-4360-4ef9-a9e2-4227a8e8c699", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", @@ -57418,16 +45136,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "aCPTIc0d", + "span_id": "AyEX3So6", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:53:27.227208+00:00", + "__datetime__": "2025-03-07T01:44:17.873486+00:00", "__module__": "datetime" }, - "trace_id": "4DRyVE86RpCeqfpE", + "trace_id": "kNsljyzfQV2Cn4aZ", "type": "metric", "unit": "tokens", - "value": 86 + "value": 36 }, { "attributes": { @@ -57435,16 +45153,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "aCPTIc0d", + "span_id": "AyEX3So6", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:53:27.227251+00:00", + "__datetime__": "2025-03-07T01:44:17.873500+00:00", "__module__": "datetime" }, - "trace_id": "4DRyVE86RpCeqfpE", + "trace_id": "kNsljyzfQV2Cn4aZ", "type": "metric", "unit": "tokens", - "value": 78 + "value": 10 }, { "attributes": { @@ -57452,16 +45170,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "aCPTIc0d", + "span_id": "AyEX3So6", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:53:27.227258+00:00", + "__datetime__": "2025-03-07T01:44:17.873503+00:00", "__module__": "datetime" }, - "trace_id": "4DRyVE86RpCeqfpE", + "trace_id": "kNsljyzfQV2Cn4aZ", "type": "metric", "unit": "tokens", - "value": 164 + "value": 46 } ] } @@ -57469,7 +45187,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"required\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:42933\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:20e5d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0cd43\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"using LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:20e5d\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:20e5d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:20e5d\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:42933\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:20e5d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0cd43\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -58078,47 +45796,7 @@ "data": { "event": { "delta": { - "text": "[", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "get", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "_bo", + "text": "[k", "type": "text" }, "event_type": { @@ -58138,7 +45816,7 @@ "data": { "event": { "delta": { - "text": "iling_point", + "text": "nowledge_search(query=\"using LoRA in Torchtune", "type": "text" }, "event_type": { @@ -58158,7 +45836,7 @@ "data": { "event": { "delta": { - "text": "(", + "text": "\")]", "type": "text" }, "event_type": { @@ -58178,8 +45856,19 @@ "data": { "event": { "delta": { - "text": "liquid", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "using LoRA in Torchtune" + }, + "call_id": "ce4b06be-6e7f-45cf-9555-25398caaf4f1", + "tool_name": "knowledge_search" + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -58187,7 +45876,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -58198,53 +45891,94 @@ "data": { "event": { "delta": { - "text": "_name='", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "polyju", - "type": "text" + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "vGtNmXNY", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:32.673350+00:00", + "__module__": "datetime" + }, + "trace_id": "8C2YTmRESTKZ0i1l", + "type": "metric", + "unit": "tokens", + "value": 107 }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "vGtNmXNY", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:32.673375+00:00", + "__module__": "datetime" + }, + "trace_id": "8C2YTmRESTKZ0i1l", + "type": "metric", + "unit": "tokens", + "value": 23 }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "vGtNmXNY", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:32.673381+00:00", + "__module__": "datetime" + }, + "trace_id": "8C2YTmRESTKZ0i1l", + "type": "metric", + "unit": "tokens", + "value": 130 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:42933\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:20e5d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0cd43\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "ice',", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -58258,7 +45992,7 @@ "data": { "event": { "delta": { - "text": " celci", + "text": "I", "type": "text" }, "event_type": { @@ -58278,7 +46012,7 @@ "data": { "event": { "delta": { - "text": "us=True", + "text": "'m ready to help. What's", "type": "text" }, "event_type": { @@ -58298,7 +46032,7 @@ "data": { "event": { "delta": { - "text": ")]", + "text": " your question about Torchtune?", "type": "text" }, "event_type": { @@ -58312,42 +46046,6 @@ "metrics": null } }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "celcius": true, - "liquid_name": "polyjuice" - }, - "call_id": "dcf85480-0aa4-4f86-9720-6c030aa67344", - "tool_name": "get_boiling_point" - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": null - } - }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", @@ -58371,19 +46069,55 @@ }, "metrics": [ { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "prompt_tokens", - "unit": null, - "value": 30 + "span_id": "7n3WMt3R", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:31.179269+00:00", + "__module__": "datetime" + }, + "trace_id": "BLgI_VzNTCCRs_2T", + "type": "metric", + "unit": "tokens", + "value": 75 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "completion_tokens", - "unit": null, - "value": 28 + "span_id": "7n3WMt3R", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:31.179301+00:00", + "__module__": "datetime" + }, + "trace_id": "BLgI_VzNTCCRs_2T", + "type": "metric", + "unit": "tokens", + "value": 25 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "total_tokens", - "unit": null, - "value": 58 + "span_id": "7n3WMt3R", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:31.179308+00:00", + "__module__": "datetime" + }, + "trace_id": "BLgI_VzNTCCRs_2T", + "type": "metric", + "unit": "tokens", + "value": 100 } ] } @@ -58391,7 +46125,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:8106c\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:a03f3\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0719d\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your first question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"using LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:a03f3\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:a03f3\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:a03f3\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:8106c\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:a03f3\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0719d\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your first question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " substance,", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -59040,7 +46774,7 @@ "data": { "event": { "delta": { - "text": " it doesn", + "text": "[k", "type": "text" }, "event_type": { @@ -59060,7 +46794,7 @@ "data": { "event": { "delta": { - "text": "'t", + "text": "nowledge_search(query=\"using LoRA in Torchtune", "type": "text" }, "event_type": { @@ -59080,7 +46814,7 @@ "data": { "event": { "delta": { - "text": " have a", + "text": "\")]", "type": "text" }, "event_type": { @@ -59100,8 +46834,19 @@ "data": { "event": { "delta": { - "text": " boiling", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "using LoRA in Torchtune" + }, + "call_id": "d45a488f-368a-4a3b-a2d9-8fde584fc8f8", + "tool_name": "knowledge_search" + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -59109,7 +46854,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -59120,33 +46869,94 @@ "data": { "event": { "delta": { - "text": " point.", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "qLPBZlok", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:26.209198+00:00", + "__module__": "datetime" + }, + "trace_id": "7GQeegpgTI-gqjHp", + "type": "metric", + "unit": "tokens", + "value": 108 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "qLPBZlok", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:26.209239+00:00", + "__module__": "datetime" + }, + "trace_id": "7GQeegpgTI-gqjHp", + "type": "metric", + "unit": "tokens", + "value": 23 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "qLPBZlok", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:26.209247+00:00", + "__module__": "datetime" + }, + "trace_id": "7GQeegpgTI-gqjHp", + "type": "metric", + "unit": "tokens", + "value": 131 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:8106c\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:a03f3\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0719d\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " Poly", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -59160,7 +46970,7 @@ "data": { "event": { "delta": { - "text": "juice", + "text": "I", "type": "text" }, "event_type": { @@ -59180,7 +46990,7 @@ "data": { "event": { "delta": { - "text": " Potion", + "text": "'m ready to help. What's", "type": "text" }, "event_type": { @@ -59200,7 +47010,7 @@ "data": { "event": { "delta": { - "text": " is a", + "text": " your first question about Torchtune", "type": "text" }, "event_type": { @@ -59220,7 +47030,7 @@ "data": { "event": { "delta": { - "text": " magical", + "text": "?", "type": "text" }, "event_type": { @@ -59240,33 +47050,94 @@ "data": { "event": { "delta": { - "text": " concoction", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "mYTkxvK_", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:23.525734+00:00", + "__module__": "datetime" + }, + "trace_id": "kpcdkZQ2SsSOh9Lw", + "type": "metric", + "unit": "tokens", + "value": 75 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "mYTkxvK_", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:23.525763+00:00", + "__module__": "datetime" + }, + "trace_id": "kpcdkZQ2SsSOh9Lw", + "type": "metric", + "unit": "tokens", + "value": 26 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "mYTkxvK_", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:23.525770+00:00", + "__module__": "datetime" + }, + "trace_id": "kpcdkZQ2SsSOh9Lw", + "type": "metric", + "unit": "tokens", + "value": 101 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " that allows", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -59280,7 +47151,7 @@ "data": { "event": { "delta": { - "text": " the drink", + "text": "[k", "type": "text" }, "event_type": { @@ -59300,7 +47171,7 @@ "data": { "event": { "delta": { - "text": "er to", + "text": "nowledge_search(query=\"Tor", "type": "text" }, "event_type": { @@ -59320,7 +47191,7 @@ "data": { "event": { "delta": { - "text": " assume the", + "text": "chtune documentation\")]", "type": "text" }, "event_type": { @@ -59340,8 +47211,19 @@ "data": { "event": { "delta": { - "text": " form", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "Torchtune documentation" + }, + "call_id": "385cbde8-19e8-4c8b-84ca-b75050b3666b", + "tool_name": "knowledge_search" + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -59349,7 +47231,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -59360,53 +47246,94 @@ "data": { "event": { "delta": { - "text": " and appearance", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " of", - "type": "text" + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "-7YS2sLl", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:30.668846+00:00", + "__module__": "datetime" + }, + "trace_id": "BLgI_VzNTCCRs_2T", + "type": "metric", + "unit": "tokens", + "value": 39 }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "-7YS2sLl", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:30.668859+00:00", + "__module__": "datetime" + }, + "trace_id": "BLgI_VzNTCCRs_2T", + "type": "metric", + "unit": "tokens", + "value": 20 }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "-7YS2sLl", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:30.668861+00:00", + "__module__": "datetime" + }, + "trace_id": "BLgI_VzNTCCRs_2T", + "type": "metric", + "unit": "tokens", + "value": 59 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Llama3-8B attention type\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:num-1\\nContent: 3 `_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\\nof models across a `range of different benchmarks `_.\\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. In this tutorial we will focus on the 8B size model.\\nThere are a few main changes between Llama2-7B and Llama3-8B models:\\n\\n- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B\\n- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:num-1\\nContent: instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_\\n\\n|\\n\\nGetting access to Llama3-8B-Instruct\\n------------------------------------\\n\\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. First, let's download the model from Hugging Face. You will need to follow the instructions\\non the `official Meta page `_ to gain access to the model.\\nNext, make sure you grab your Hugging Face token from `here `_.\\n\\n\\n.. code-block:: bash\\n\\n tune download meta-llama/Meta-Llama-3\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:num-0\\nContent: :`download Llama3 Instruct weights `\\n\\n\\nTemplate changes from Llama2 to Llama3\\n--------------------------------------\\n\\nThe Llama2 chat model requires a specific template when prompting the pre-trained\\nmodel. Since the chat model was pretrained with this prompt template, if you want to run\\ninference on the model, you'll need to use the same template for optimal performance\\non chat data. Otherwise, the model will just perform standard text completion, which\\nmay or may not align with your intended use case.\\n\\nFrom the `official Llama2 prompt\\ntemplate guide `_\\nfor the Llama2 chat model, we can see that special tags are added:\\n\\n.. code-block:: text\\n\\n [INST] <>\\n You are a helpful, respectful, and honest assistant.\\n <>\\n\\n Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant \\n\\nLlama3 Instruct `overhauled `\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:num-0\\nContent: 'm Meta AI, your friendly AI assistant<|eot_id|>\\n\\nThe tags are entirely different, and they are actually encoded differently than in\\nLlama2. Let's walk through tokenizing an example with the Llama2 template and the\\nLlama3 template to understand how.\\n\\n.. note::\\n The Llama3 Base model uses a `different prompt template\\n `_ than Llama3 Instruct\\n because it has not yet been instruct tuned and the extra special tokens are untrained. If you\\n are running inference on the Llama3 Base model without fine-tuning we recommend the base\\n template for optimal performance. Generally, for instruct and chat data, we recommend using\\n Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\\n Llama3 Instruct.\\n\\n.. _prompt_template_vs_special_tokens:\\n\\nTokenizing prompt templates & special tokens\\n--------------------------------------------\\n\\nLet's say I have a sample of a single user-assistant turn accompanied with a system\\nprompt:\\n\\n.. code-block:: python\\n\\n sample = [\\n {\\n \\\"role\\\": \\\"system\\\",\\n \\\"\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:num-3\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\", and \\\"output_proj\\\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\\\"q_proj\\\", \\\"v_proj\\\"])\\n\\n.. note::\\n\\n Calling :func:`lora_llama_2\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Insert documents into memory\", \"parameters\": {}, \"tool_name\": \"insert_into_memory\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " another person", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -59420,7 +47347,7 @@ "data": { "event": { "delta": { - "text": ", but", + "text": "L", "type": "text" }, "event_type": { @@ -59440,7 +47367,7 @@ "data": { "event": { "delta": { - "text": " it's", + "text": "lama3-8B uses grouped-query", "type": "text" }, "event_type": { @@ -59460,7 +47387,7 @@ "data": { "event": { "delta": { - "text": " not a", + "text": " attention instead of", "type": "text" }, "event_type": { @@ -59480,7 +47407,7 @@ "data": { "event": { "delta": { - "text": " physical substance", + "text": " the standard multi-head attention.", "type": "text" }, "event_type": { @@ -59500,53 +47427,94 @@ "data": { "event": { "delta": { - "text": " that", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " can be", - "type": "text" + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "1eIEdjPP", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:18.982970+00:00", + "__module__": "datetime" + }, + "trace_id": "rNeuYcnxTSqrP6Dg", + "type": "metric", + "unit": "tokens", + "value": 80 }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "1eIEdjPP", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:18.983000+00:00", + "__module__": "datetime" + }, + "trace_id": "rNeuYcnxTSqrP6Dg", + "type": "metric", + "unit": "tokens", + "value": 28 }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "1eIEdjPP", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:18.983005+00:00", + "__module__": "datetime" + }, + "trace_id": "rNeuYcnxTSqrP6Dg", + "type": "metric", + "unit": "tokens", + "value": 108 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Llama3-8B attention type\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:num-1\\nContent: 3 `_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\\nof models across a `range of different benchmarks `_.\\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. In this tutorial we will focus on the 8B size model.\\nThere are a few main changes between Llama2-7B and Llama3-8B models:\\n\\n- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B\\n- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:num-1\\nContent: instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_\\n\\n|\\n\\nGetting access to Llama3-8B-Instruct\\n------------------------------------\\n\\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. First, let's download the model from Hugging Face. You will need to follow the instructions\\non the `official Meta page `_ to gain access to the model.\\nNext, make sure you grab your Hugging Face token from `here `_.\\n\\n\\n.. code-block:: bash\\n\\n tune download meta-llama/Meta-Llama-3\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:num-0\\nContent: :`download Llama3 Instruct weights `\\n\\n\\nTemplate changes from Llama2 to Llama3\\n--------------------------------------\\n\\nThe Llama2 chat model requires a specific template when prompting the pre-trained\\nmodel. Since the chat model was pretrained with this prompt template, if you want to run\\ninference on the model, you'll need to use the same template for optimal performance\\non chat data. Otherwise, the model will just perform standard text completion, which\\nmay or may not align with your intended use case.\\n\\nFrom the `official Llama2 prompt\\ntemplate guide `_\\nfor the Llama2 chat model, we can see that special tags are added:\\n\\n.. code-block:: text\\n\\n [INST] <>\\n You are a helpful, respectful, and honest assistant.\\n <>\\n\\n Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant \\n\\nLlama3 Instruct `overhauled `\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:num-0\\nContent: 'm Meta AI, your friendly AI assistant<|eot_id|>\\n\\nThe tags are entirely different, and they are actually encoded differently than in\\nLlama2. Let's walk through tokenizing an example with the Llama2 template and the\\nLlama3 template to understand how.\\n\\n.. note::\\n The Llama3 Base model uses a `different prompt template\\n `_ than Llama3 Instruct\\n because it has not yet been instruct tuned and the extra special tokens are untrained. If you\\n are running inference on the Llama3 Base model without fine-tuning we recommend the base\\n template for optimal performance. Generally, for instruct and chat data, we recommend using\\n Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\\n Llama3 Instruct.\\n\\n.. _prompt_template_vs_special_tokens:\\n\\nTokenizing prompt templates & special tokens\\n--------------------------------------------\\n\\nLet's say I have a sample of a single user-assistant turn accompanied with a system\\nprompt:\\n\\n.. code-block:: python\\n\\n sample = [\\n {\\n \\\"role\\\": \\\"system\\\",\\n \\\"\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:num-3\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\", and \\\"output_proj\\\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\\\"q_proj\\\", \\\"v_proj\\\"])\\n\\n.. note::\\n\\n Calling :func:`lora_llama_2\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " measured or", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -59560,7 +47528,7 @@ "data": { "event": { "delta": { - "text": " analyzed in", + "text": "L", "type": "text" }, "event_type": { @@ -59580,7 +47548,7 @@ "data": { "event": { "delta": { - "text": " the same", + "text": "lama3-8B uses grouped-query attention instead of", "type": "text" }, "event_type": { @@ -59600,7 +47568,7 @@ "data": { "event": { "delta": { - "text": " way as", + "text": " the standard", "type": "text" }, "event_type": { @@ -59620,7 +47588,7 @@ "data": { "event": { "delta": { - "text": " real-world", + "text": " multi-head attention.", "type": "text" }, "event_type": { @@ -59640,53 +47608,94 @@ "data": { "event": { "delta": { - "text": " chemicals.\n\n", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "If you", - "type": "text" + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "SlTnlfYc", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:12.884663+00:00", + "__module__": "datetime" + }, + "trace_id": "liTx9auyTkyfvrBr", + "type": "metric", + "unit": "tokens", + "value": 80 }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "SlTnlfYc", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:12.884753+00:00", + "__module__": "datetime" + }, + "trace_id": "liTx9auyTkyfvrBr", + "type": "metric", + "unit": "tokens", + "value": 28 }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "SlTnlfYc", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:12.884760+00:00", + "__module__": "datetime" + }, + "trace_id": "liTx9auyTkyfvrBr", + "type": "metric", + "unit": "tokens", + "value": 108 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Insert documents into memory\", \"parameters\": {}, \"tool_name\": \"insert_into_memory\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " have any", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -59700,7 +47709,7 @@ "data": { "event": { "delta": { - "text": " other questions", + "text": "[k", "type": "text" }, "event_type": { @@ -59720,7 +47729,7 @@ "data": { "event": { "delta": { - "text": " or if", + "text": "nowledge_search(query=\"Llama3-8", "type": "text" }, "event_type": { @@ -59740,7 +47749,7 @@ "data": { "event": { "delta": { - "text": " there", + "text": "B attention type\")]", "type": "text" }, "event_type": { @@ -59760,8 +47769,19 @@ "data": { "event": { "delta": { - "text": "'s anything", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "Llama3-8B attention type" + }, + "call_id": "4901bbdf-8faf-4a57-b6f6-01688c6290e6", + "tool_name": "knowledge_search" + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -59769,7 +47789,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -59780,53 +47804,94 @@ "data": { "event": { "delta": { - "text": " else I", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " can help", - "type": "text" + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "DBPomV08", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:15.412559+00:00", + "__module__": "datetime" + }, + "trace_id": "rNeuYcnxTSqrP6Dg", + "type": "metric", + "unit": "tokens", + "value": 40 }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "DBPomV08", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:15.412607+00:00", + "__module__": "datetime" + }, + "trace_id": "rNeuYcnxTSqrP6Dg", + "type": "metric", + "unit": "tokens", + "value": 24 }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "DBPomV08", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:15.412615+00:00", + "__module__": "datetime" + }, + "trace_id": "rNeuYcnxTSqrP6Dg", + "type": "metric", + "unit": "tokens", + "value": 64 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " you", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -59840,7 +47905,7 @@ "data": { "event": { "delta": { - "text": " with,", + "text": "[k", "type": "text" }, "event_type": { @@ -59860,7 +47925,7 @@ "data": { "event": { "delta": { - "text": " feel free", + "text": "nowledge_search(query=\"Llama3-8B attention", "type": "text" }, "event_type": { @@ -59880,7 +47945,7 @@ "data": { "event": { "delta": { - "text": " to ask", + "text": " type\")]", "type": "text" }, "event_type": { @@ -59900,8 +47965,19 @@ "data": { "event": { "delta": { - "text": "!", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "Llama3-8B attention type" + }, + "call_id": "dd056386-b105-47e5-bd85-07e5ae096de1", + "tool_name": "knowledge_search" + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -59909,7 +47985,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -59937,19 +48017,55 @@ }, "metrics": [ { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "prompt_tokens", - "unit": null, - "value": 30 + "span_id": "yjKrmpeo", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:12.041566+00:00", + "__module__": "datetime" + }, + "trace_id": "liTx9auyTkyfvrBr", + "type": "metric", + "unit": "tokens", + "value": 40 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "completion_tokens", - "unit": null, - "value": 113 + "span_id": "yjKrmpeo", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:12.041591+00:00", + "__module__": "datetime" + }, + "trace_id": "liTx9auyTkyfvrBr", + "type": "metric", + "unit": "tokens", + "value": 24 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "total_tokens", - "unit": null, - "value": 143 + "span_id": "yjKrmpeo", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:12.041597+00:00", + "__module__": "datetime" + }, + "trace_id": "liTx9auyTkyfvrBr", + "type": "metric", + "unit": "tokens", + "value": 64 } ] } @@ -59957,7 +48073,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"required\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the current CEO of Meta is.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"current CEO of Meta\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"{\\\"query\\\": \\\"current CEO of Meta\\\", \\\"top_k\\\": [{\\\"title\\\": \\\"Meta - Leadership & Governance\\\", \\\"url\\\": \\\"https://investor.atmeta.com/leadership-and-governance/\\\", \\\"content\\\": \\\"Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. Mark is responsible for setting the overall direction and product strategy for the company. He leads the design of Meta's services and development of its core technology and infrastructure. Mark studied computer science at Harvard\\\", \\\"score\\\": 0.8342047, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Executives - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer Joel Kaplan, Chief Global Affairs Officer Susan Li, Chief Financial Officer Javier Olivan, Chief Operating Officer Chris Cox, Chief Product Officer Andrew \\\\u2018Boz\\\\u2019 Bosworth, Chief Technology Officer Jennifer Newstead, Chief Legal Officer Dave Wehner, Chief Strategy Officer Will Cathcart, Head of WhatsApp Naomi Gleit, Head of Product John Hegeman, Chief Revenue Officer Adam Mosseri, Head of Instagram Erin Egan, Chief Privacy Officer, Policy Michel Protti, Chief Privacy Officer, Product Alex Schultz, Chief Marketing Officer and VP of Analytics Tom Alison, Head of Facebook Nicola Mendelsohn, Head of Global Business Group Ahmad Al-Dahle, VP and Head of GenAI at Meta Joelle Pineau, Vice President of AI Research and Head of FAIR at Meta\\\", \\\"score\\\": 0.8190992, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\\\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\\\u00a9 2025 Meta\\\", \\\"score\\\": 0.79099923, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Meet the Executive CSuite Team of Meta (Facebook) [2025]\\\", \\\"url\\\": \\\"https://digitaldefynd.com/IQ/meet-the-executive-csuite-team-of-meta-facebook/\\\", \\\"content\\\": \\\"Harvard University Executive Programs Free Harvard University Courses As a chief financial officer of Meta, Susan Li oversees the firm\\\\u2019s finance and facilities team to keep track of the company\\\\u2019s overall financial health. The chief operating officer of Meta, Javier Olivan, oversees the firm\\\\u2019s business team, infrastructure, and other products. Andrew Bosworth, called Boz, serves as chief technology officer at Meta and is responsible for leading the firm\\\\u2019s AR/VR organization, Reality Labs. Andrew has also served as engineering director to oversee events, mobile monetization, and feed ads and as VP of ads and business platforms to lead engineering, design, analytics, and product teams. Meta\\\\u2019s c-suite team comprises experienced and diverse executives, having extensive experience in technology, finance, legal, and all major industries.\\\", \\\"score\\\": 0.7602419, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg - Wikipedia\\\", \\\"url\\\": \\\"https://en.wikipedia.org/wiki/Mark_Zuckerberg\\\", \\\"content\\\": \\\"They began dating in 2003.[175] In September 2010, Chan, who was a medical student at the University of California, San Francisco at the time,[176] moved into his rented house in Palo Alto, California.[177][178] They married on May 19, 2012, in the grounds of his mansion in an event that also celebrated her graduation from medical school.[179][180] Zuckerberg revealed in July 2015 that they were expecting a baby girl and that Chan had previously experienced three miscarriages.[181] Their first daughter was born in December 2015.[182] They announced in a Chinese New Year video that their daughter's Chinese name is Chen Mingyu (Chinese: \\\\u9648\\\\u660e\\\\u5b87).[183] Their second daughter was born in August 2017.[184] Zuckerberg and his wife welcomed their third daughter in March 2023 and announced the news across his social media pages.[185] The couple also have a Puli dog named Beast,[186] who has over two million followers on Facebook.[187] Zuckerberg commissioned the visual artist Daniel Arsham to build a 7-foot-tall sculpture of his wife, which was unveiled in 2024.[188]\\\", \\\"score\\\": 0.05564338, \\\"raw_content\\\": null}]}\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -59985,27 +48101,7 @@ "data": { "event": { "delta": { - "text": "[", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "get_bo", + "text": "The", "type": "text" }, "event_type": { @@ -60025,7 +48121,7 @@ "data": { "event": { "delta": { - "text": "iling_point(liquid_name='poly", + "text": " current CEO of Meta is Mark Zuckerberg.", "type": "text" }, "event_type": { @@ -60045,53 +48141,94 @@ "data": { "event": { "delta": { - "text": "ju", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "ice', cel", - "type": "text" + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "oB7hDf6E", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:07.084924+00:00", + "__module__": "datetime" + }, + "trace_id": "hwA8OLUhQ1qa3ecF", + "type": "metric", + "unit": "tokens", + "value": 1145 }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "oB7hDf6E", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:07.084934+00:00", + "__module__": "datetime" + }, + "trace_id": "hwA8OLUhQ1qa3ecF", + "type": "metric", + "unit": "tokens", + "value": 19 }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "oB7hDf6E", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:07.084936+00:00", + "__module__": "datetime" + }, + "trace_id": "hwA8OLUhQ1qa3ecF", + "type": "metric", + "unit": "tokens", + "value": 1164 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the current CEO of Meta is.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "cius", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -60105,8 +48242,13 @@ "data": { "event": { "delta": { - "text": "=True", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -60125,8 +48267,13 @@ "data": { "event": { "delta": { - "text": ")]", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "brave_search.call(query=\"current CEO of Meta\")", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -60152,11 +48299,14 @@ }, "tool_call": { "arguments": { - "celcius": true, - "liquid_name": "polyjuice" + "query": "current CEO of Meta" }, - "call_id": "81e4629c-9ed7-4fda-b0fd-8db41cd00407", - "tool_name": "get_boiling_point" + "call_id": "535c272b-768b-44fe-b303-2eae022f67f5", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "brave_search" + } }, "type": "tool_call" }, @@ -60198,19 +48348,55 @@ }, "metrics": [ { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "prompt_tokens", - "unit": null, - "value": 30 + "span_id": "AZ60Ocso", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:03.907918+00:00", + "__module__": "datetime" + }, + "trace_id": "hwA8OLUhQ1qa3ecF", + "type": "metric", + "unit": "tokens", + "value": 34 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "completion_tokens", - "unit": null, - "value": 28 + "span_id": "AZ60Ocso", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:03.907933+00:00", + "__module__": "datetime" + }, + "trace_id": "hwA8OLUhQ1qa3ecF", + "type": "metric", + "unit": "tokens", + "value": 10 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "total_tokens", - "unit": null, - "value": 58 + "span_id": "AZ60Ocso", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:03.907936+00:00", + "__module__": "datetime" + }, + "trace_id": "hwA8OLUhQ1qa3ecF", + "type": "metric", + "unit": "tokens", + "value": 44 } ] } @@ -60218,7 +48404,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Write code and execute it to find the answer for: What is the 100th prime number?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"def is_prime(n):\\n if n <= 1:\\n return False\\n if n <= 3:\\n return True\\n if n % 2 == 0 or n % 3 == 0:\\n return False\\n i = 5\\n while i * i <= n:\\n if n % i == 0 or n % (i + 2) == 0:\\n return False\\n i += 6\\n return True\\n\\ndef nth_prime(n):\\n count = 0\\n num = 2\\n while True:\\n if is_prime(num):\\n count += 1\\n if count == n:\\n return num\\n num += 1\\n\\nprint(nth_prime(100))\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": \"get_boiling_point\", \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -60266,7 +48452,7 @@ "data": { "event": { "delta": { - "text": " 100th prime number is 541", + "text": " boiling point of polyjuice is -100 degrees Celsius", "type": "text" }, "event_type": { @@ -60328,16 +48514,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "bxIams_G", + "span_id": "drZjZkfj", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:13.404182+00:00", + "__datetime__": "2025-03-07T02:04:33.852666+00:00", "__module__": "datetime" }, - "trace_id": "snO106yxStaL10ow", + "trace_id": "Sn0I7GFHTxKxewK2", "type": "metric", "unit": "tokens", - "value": 252 + "value": 77 }, { "attributes": { @@ -60345,16 +48531,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "bxIams_G", + "span_id": "drZjZkfj", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:13.404224+00:00", + "__datetime__": "2025-03-07T02:04:33.852692+00:00", "__module__": "datetime" }, - "trace_id": "snO106yxStaL10ow", + "trace_id": "Sn0I7GFHTxKxewK2", "type": "metric", "unit": "tokens", - "value": 20 + "value": 23 }, { "attributes": { @@ -60362,16 +48548,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "bxIams_G", + "span_id": "drZjZkfj", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:13.404230+00:00", + "__datetime__": "2025-03-07T02:04:33.852699+00:00", "__module__": "datetime" }, - "trace_id": "snO106yxStaL10ow", + "trace_id": "Sn0I7GFHTxKxewK2", "type": "metric", "unit": "tokens", - "value": 272 + "value": 100 } ] } @@ -60379,7 +48565,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Write code and execute it to find the answer for: What is the 100th prime number?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"def is_prime(n):\\n if n <= 1:\\n return False\\n if n == 2:\\n return True\\n if n % 2 == 0:\\n return False\\n max_divisor = int(n**0.5) + 1\\n for d in range(3, max_divisor, 2):\\n if n % d == 0:\\n return False\\n return True\\n\\ndef nth_prime(n):\\n count = 0\\n num = 2\\n while True:\\n if is_prime(num):\\n count += 1\\n if count == n:\\n return num\\n num += 1\\n\\nprint(nth_prime(100))\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -60427,87 +48613,7 @@ "data": { "event": { "delta": { - "text": " 100", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "th prime", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " number is", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " 541", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": ".", + "text": " boiling point of polyjuice is -100 degrees Celsius.", "type": "text" }, "event_type": { @@ -60544,19 +48650,55 @@ }, "metrics": [ { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "prompt_tokens", - "unit": null, - "value": 243 + "span_id": "WMEZtUXH", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:00:32.617998+00:00", + "__module__": "datetime" + }, + "trace_id": "f9RM1qaUTk2LvaVo", + "type": "metric", + "unit": "tokens", + "value": 77 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "completion_tokens", - "unit": null, - "value": 20 + "span_id": "WMEZtUXH", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:00:32.618030+00:00", + "__module__": "datetime" + }, + "trace_id": "f9RM1qaUTk2LvaVo", + "type": "metric", + "unit": "tokens", + "value": 23 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "total_tokens", - "unit": null, - "value": 263 + "span_id": "WMEZtUXH", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:00:32.618036+00:00", + "__module__": "datetime" + }, + "trace_id": "f9RM1qaUTk2LvaVo", + "type": "metric", + "unit": "tokens", + "value": 100 } ] } @@ -60564,7 +48706,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Write code and execute it to find the answer for: What is the 100th prime number?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"required\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -60592,13 +48734,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" + "text": "The", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -60617,13 +48754,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "def is", - "type": "tool_call" + "text": " function get_boiling_point is not", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -60642,13 +48774,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "_prime(n", - "type": "tool_call" + "text": " able", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -60667,13 +48794,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "):\n", - "type": "tool_call" + "text": " to find the", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -60692,13 +48814,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " ", - "type": "tool_call" + "text": " boiling point of \"polyjuice\" as", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -60717,13 +48834,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " if n", - "type": "tool_call" + "text": " it", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -60742,13 +48854,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " <= ", - "type": "tool_call" + "text": " is not a real liquid", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -60767,13 +48874,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "1", - "type": "tool_call" + "text": ". Polyju", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -60792,13 +48894,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": ":\n ", - "type": "tool_call" + "text": "ice is a fictional substance from the", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -60817,13 +48914,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " return False", - "type": "tool_call" + "text": " Harry Potter series.", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -60842,43 +48934,94 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "\n ", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "p7Vx9VAq", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:04:28.232189+00:00", + "__module__": "datetime" + }, + "trace_id": "WKEqFugATCeCl8mc", + "type": "metric", + "unit": "tokens", + "value": 77 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "p7Vx9VAq", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:04:28.232325+00:00", + "__module__": "datetime" + }, + "trace_id": "WKEqFugATCeCl8mc", + "type": "metric", + "unit": "tokens", + "value": 51 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "p7Vx9VAq", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:04:28.232334+00:00", + "__module__": "datetime" + }, + "trace_id": "WKEqFugATCeCl8mc", + "type": "metric", + "unit": "tokens", + "value": 128 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": \"get_boiling_point\", \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " if n", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -60892,13 +49035,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " == ", - "type": "tool_call" + "text": "The", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -60917,13 +49055,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "2:\n", - "type": "tool_call" + "text": " function call should be", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -60942,13 +49075,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " ", - "type": "tool_call" + "text": ":\n[", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -60967,13 +49095,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " return True", - "type": "tool_call" + "text": "get", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -60992,13 +49115,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "\n ", - "type": "tool_call" + "text": "_boiling_point(liquid_name='polyjuice', celci", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -61017,13 +49135,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " if n", - "type": "tool_call" + "text": "us=True)]", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -61042,43 +49155,94 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " % ", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "JN7UZs_c", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:42.473221+00:00", + "__module__": "datetime" + }, + "trace_id": "H3r-_Zh-TVqtSp7k", + "type": "metric", + "unit": "tokens", + "value": 86 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "JN7UZs_c", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:42.473254+00:00", + "__module__": "datetime" + }, + "trace_id": "H3r-_Zh-TVqtSp7k", + "type": "metric", + "unit": "tokens", + "value": 34 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "JN7UZs_c", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:42.473261+00:00", + "__module__": "datetime" + }, + "trace_id": "H3r-_Zh-TVqtSp7k", + "type": "metric", + "unit": "tokens", + "value": 120 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "2 ==", - "type": "tool_call" + "event": { + "delta": { + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -61092,13 +49256,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " 0", - "type": "tool_call" + "text": "The", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -61117,13 +49276,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": ":\n ", - "type": "tool_call" + "text": " function `get_boiling_point`", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -61142,13 +49296,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " return", - "type": "tool_call" + "text": " is not a real function and cannot be", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -61167,13 +49316,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " False", - "type": "tool_call" + "text": " used to determine the boiling point of polyju", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -61192,13 +49336,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "\n ", - "type": "tool_call" + "text": "ice. Polyjuice is a fictional substance from the", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -61217,13 +49356,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " max_div", - "type": "tool_call" + "text": " Harry Potter series and does not have a real-world boiling", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -61242,13 +49376,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "isor", - "type": "tool_call" + "text": " point. If you have any other questions or need help", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -61267,13 +49396,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " =", - "type": "tool_call" + "text": " with a different topic, feel free to ask!", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -61292,43 +49416,94 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " int(n", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "aCPTIc0d", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:53:27.227208+00:00", + "__module__": "datetime" + }, + "trace_id": "4DRyVE86RpCeqfpE", + "type": "metric", + "unit": "tokens", + "value": 86 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "aCPTIc0d", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:53:27.227251+00:00", + "__module__": "datetime" + }, + "trace_id": "4DRyVE86RpCeqfpE", + "type": "metric", + "unit": "tokens", + "value": 78 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "aCPTIc0d", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:53:27.227258+00:00", + "__module__": "datetime" + }, + "trace_id": "4DRyVE86RpCeqfpE", + "type": "metric", + "unit": "tokens", + "value": 164 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"required\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "**0", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -61342,13 +49517,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": ".", - "type": "tool_call" + "text": "The", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -61367,13 +49537,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "5)", - "type": "tool_call" + "text": " function call should be in the following format", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -61392,13 +49557,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " + ", - "type": "tool_call" + "text": ": [function_name(parameters)]. However", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -61417,13 +49577,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "1\n", - "type": "tool_call" + "text": ", the function get_boiling_point is not recognized", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -61442,13 +49597,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " for", - "type": "tool_call" + "text": ". If the function", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -61467,13 +49617,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " d in", - "type": "tool_call" + "text": " is supposed to return the boiling point of a liquid, it should be defined", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -61492,13 +49637,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " range(", - "type": "tool_call" + "text": " before it can be used. \n\nIn this", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -61517,13 +49657,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "3,", - "type": "tool_call" + "text": " case, I will assume that the function get_boiling_point is defined as", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -61542,13 +49677,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " max_div", - "type": "tool_call" + "text": " follows:\ndef get", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -61567,13 +49697,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "isor,", - "type": "tool_call" + "text": "_boiling_point(liquid_name, celcius=True):\n # This", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -61592,13 +49717,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " 2", - "type": "tool_call" + "text": " function returns the", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -61617,13 +49737,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "):\n ", - "type": "tool_call" + "text": " boiling point of a liquid in Celcius or Fahrenheit\n boiling_points", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -61642,13 +49757,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " if n", - "type": "tool_call" + "text": " = {\n \"water\": 100,\n \"polyjuice\":", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -61667,13 +49777,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " % d", - "type": "tool_call" + "text": " 120 # Assuming poly", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -61692,13 +49797,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " == ", - "type": "tool_call" + "text": "juice has a boiling point of 120 degrees Cel", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -61717,13 +49817,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "0:\n", - "type": "tool_call" + "text": "cius\n }\n if liquid", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -61742,13 +49837,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " return", - "type": "tool_call" + "text": "_name in boiling_points:\n if celcius:\n return", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -61767,13 +49857,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " False", - "type": "tool_call" + "text": " boiling_points[liquid_name]\n else:\n return boiling_points[liquid", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -61792,13 +49877,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "\n", - "type": "tool_call" + "text": "_name] * 9/5 + ", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -61817,13 +49897,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " return", - "type": "tool_call" + "text": "32\n else:\n return \"Boiling point not found", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -61842,13 +49917,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " True\n\n", - "type": "tool_call" + "text": "\"\n\nNow, the function call", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -61867,13 +49937,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "def nth", - "type": "tool_call" + "text": " should be: \n", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -61892,13 +49957,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "_prime(n", - "type": "tool_call" + "text": "[get_boiling_point(liquid_name=\"polyju", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -61917,13 +49977,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "):\n ", - "type": "tool_call" + "text": "ice\", celcius=True)]", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -61942,43 +49997,94 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " count", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "NnkGeCwM", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:35.213901+00:00", + "__module__": "datetime" + }, + "trace_id": "7ifSRjCjRIioDOte", + "type": "metric", + "unit": "tokens", + "value": 86 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "NnkGeCwM", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:35.213925+00:00", + "__module__": "datetime" + }, + "trace_id": "7ifSRjCjRIioDOte", + "type": "metric", + "unit": "tokens", + "value": 234 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "NnkGeCwM", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:35.213931+00:00", + "__module__": "datetime" + }, + "trace_id": "7ifSRjCjRIioDOte", + "type": "metric", + "unit": "tokens", + "value": 320 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": \"get_boiling_point\", \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " =", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -61992,13 +50098,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " 0", - "type": "tool_call" + "text": "[", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -62017,13 +50118,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "\n ", - "type": "tool_call" + "text": "get_boiling_point(liquid_name='polyjuice", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -62042,13 +50138,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " num =", - "type": "tool_call" + "text": "', celcius=True)]", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -62070,9 +50161,16 @@ "parse_status": { "__enum__": "ToolCallParseStatus", "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "celcius": true, + "liquid_name": "polyjuice" + }, + "call_id": "d43b2636-903d-430d-8389-91eefe5a1d75", + "tool_name": "get_boiling_point" }, - "tool_call": " 2", "type": "tool_call" }, "event_type": { @@ -62081,7 +50179,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -62092,43 +50194,94 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "9EBiVeAT", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:04:32.221646+00:00", + "__module__": "datetime" + }, + "trace_id": "7kB12OwpSUOcwmJV", + "type": "metric", + "unit": "tokens", + "value": 30 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "9EBiVeAT", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:04:32.221673+00:00", + "__module__": "datetime" }, - "tool_call": "\n ", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "trace_id": "7kB12OwpSUOcwmJV", + "type": "metric", + "unit": "tokens", + "value": 28 }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "9EBiVeAT", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:04:32.221680+00:00", + "__module__": "datetime" + }, + "trace_id": "7kB12OwpSUOcwmJV", + "type": "metric", + "unit": "tokens", + "value": 58 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " while True", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -62142,13 +50295,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": ":\n", - "type": "tool_call" + "text": "[", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -62167,13 +50315,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " ", - "type": "tool_call" + "text": "get_boiling_point(liquid_name", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -62192,13 +50335,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " if is", - "type": "tool_call" + "text": "='polyjuice', celcius=True)]", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -62220,9 +50358,16 @@ "parse_status": { "__enum__": "ToolCallParseStatus", "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "celcius": true, + "liquid_name": "polyjuice" + }, + "call_id": "0548b2ef-daa4-4099-bb2c-b34f00752339", + "tool_name": "get_boiling_point" }, - "tool_call": "_prime(num", "type": "tool_call" }, "event_type": { @@ -62231,7 +50376,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -62242,68 +50391,94 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "):\n ", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" }, - "tool_call": " count +=", - "type": "tool_call" + "metric": "prompt_tokens", + "span_id": "lc3YWIQH", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:00:31.366139+00:00", + "__module__": "datetime" + }, + "trace_id": "zDQV0rn3TNKfByA0", + "type": "metric", + "unit": "tokens", + "value": 30 }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "lc3YWIQH", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:00:31.366166+00:00", + "__module__": "datetime" + }, + "trace_id": "zDQV0rn3TNKfByA0", + "type": "metric", + "unit": "tokens", + "value": 28 }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "lc3YWIQH", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:00:31.366172+00:00", + "__module__": "datetime" + }, + "trace_id": "zDQV0rn3TNKfByA0", + "type": "metric", + "unit": "tokens", + "value": 58 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"none\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " 1", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -62317,13 +50492,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "\n ", - "type": "tool_call" + "text": "Poly", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -62342,13 +50512,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " if count", - "type": "tool_call" + "text": "juice is a fictional potion from", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -62367,13 +50532,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " == n", - "type": "tool_call" + "text": " the Harry Potter series by J.K. Rowling. As it", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -62392,13 +50552,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": ":\n ", - "type": "tool_call" + "text": "'s not a real substance, it doesn't have a boiling point", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -62417,13 +50572,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " return num", - "type": "tool_call" + "text": ". Polyjuice Potion is a magical concoction", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -62442,13 +50592,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "\n ", - "type": "tool_call" + "text": " that allows the drinker to assume the form and", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -62467,13 +50612,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " num +=", - "type": "tool_call" + "text": " appearance", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -62492,13 +50632,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " 1", - "type": "tool_call" + "text": " of another person, but it's not a physical substance that can", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -62517,13 +50652,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "\n\nprint", - "type": "tool_call" + "text": " be measured or analyzed in the same way as real-world", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -62542,13 +50672,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "(nth", - "type": "tool_call" + "text": " chemicals.\n\nIf you", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -62567,13 +50692,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "_prime(", - "type": "tool_call" + "text": " have any other questions or", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -62592,13 +50712,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "100))", - "type": "tool_call" + "text": " if there's anything else I can help you with, feel free to ask", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -62617,23 +50732,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "code": "def is_prime(n):\n if n <= 1:\n return False\n if n == 2:\n return True\n if n % 2 == 0:\n return False\n max_divisor = int(n**0.5) + 1\n for d in range(3, max_divisor, 2):\n if n % d == 0:\n return False\n return True\n\ndef nth_prime(n):\n count = 0\n num = 2\n while True:\n if is_prime(num):\n count += 1\n if count == n:\n return num\n num += 1\n\nprint(nth_prime(100))" - }, - "call_id": "67e0a41c-4428-47a9-b276-df436c014992", - "tool_name": { - "__enum__": "BuiltinTool", - "__module__": "llama_stack.models.llama.datatypes", - "value": "code_interpreter" - } - }, - "type": "tool_call" + "text": "!", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -62641,11 +50741,7 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } @@ -62673,19 +50769,55 @@ }, "metrics": [ { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "prompt_tokens", - "unit": null, - "value": 40 + "span_id": "M0oC9v8Y", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:04:30.531648+00:00", + "__module__": "datetime" + }, + "trace_id": "0CMlh2kQShSVm3zE", + "type": "metric", + "unit": "tokens", + "value": 30 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "completion_tokens", - "unit": null, - "value": 10 + "span_id": "M0oC9v8Y", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:04:30.531666+00:00", + "__module__": "datetime" + }, + "trace_id": "0CMlh2kQShSVm3zE", + "type": "metric", + "unit": "tokens", + "value": 113 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "total_tokens", - "unit": null, - "value": 50 + "span_id": "M0oC9v8Y", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:04:30.531671+00:00", + "__module__": "datetime" + }, + "trace_id": "0CMlh2kQShSVm3zE", + "type": "metric", + "unit": "tokens", + "value": 143 } ] } @@ -62693,7 +50825,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"when was Perplexity the company founded?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Perplexity the company founding date\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"required\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -62721,67 +50853,7 @@ "data": { "event": { "delta": { - "text": "Per", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "plexity", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " the company", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " was founded", + "text": "[", "type": "text" }, "event_type": { @@ -62801,7 +50873,7 @@ "data": { "event": { "delta": { - "text": " in ", + "text": "get_boiling_point(liquid_name='polyjuice', cel", "type": "text" }, "event_type": { @@ -62821,7 +50893,7 @@ "data": { "event": { "delta": { - "text": "2022", + "text": "cius=True)]", "type": "text" }, "event_type": { @@ -62841,8 +50913,20 @@ "data": { "event": { "delta": { - "text": ".", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "celcius": true, + "liquid_name": "polyjuice" + }, + "call_id": "acbb04a1-08f4-4277-9b66-aadda2fa2be7", + "tool_name": "get_boiling_point" + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -62850,7 +50934,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -62878,19 +50966,55 @@ }, "metrics": [ { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "prompt_tokens", - "unit": null, - "value": 68 + "span_id": "jMXDDKvp", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:04:26.175063+00:00", + "__module__": "datetime" + }, + "trace_id": "44TwzIrGS2aqfbVn", + "type": "metric", + "unit": "tokens", + "value": 30 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "completion_tokens", - "unit": null, - "value": 22 + "span_id": "jMXDDKvp", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:04:26.175128+00:00", + "__module__": "datetime" + }, + "trace_id": "44TwzIrGS2aqfbVn", + "type": "metric", + "unit": "tokens", + "value": 28 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "total_tokens", - "unit": null, - "value": 90 + "span_id": "jMXDDKvp", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:04:26.175137+00:00", + "__module__": "datetime" + }, + "trace_id": "44TwzIrGS2aqfbVn", + "type": "metric", + "unit": "tokens", + "value": 58 } ] } @@ -62898,7 +51022,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"when was Perplexity the company founded?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Write code and execute it to find the answer for: What is the 100th prime number?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"def is_prime(n):\\n if n <= 1:\\n return False\\n if n <= 3:\\n return True\\n if n % 2 == 0 or n % 3 == 0:\\n return False\\n i = 5\\n while i * i <= n:\\n if n % i == 0 or n % (i + 2) == 0:\\n return False\\n i += 6\\n return True\\n\\ndef nth_prime(n):\\n count = 0\\n num = 2\\n while True:\\n if is_prime(num):\\n count += 1\\n if count == n:\\n return num\\n num += 1\\n\\nprint(nth_prime(100))\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -62926,7 +51050,7 @@ "data": { "event": { "delta": { - "text": "[k", + "text": "The", "type": "text" }, "event_type": { @@ -62946,7 +51070,7 @@ "data": { "event": { "delta": { - "text": "nowledge_search", + "text": " 100th prime number is 541", "type": "text" }, "event_type": { @@ -62966,7 +51090,7 @@ "data": { "event": { "delta": { - "text": "(query=\"", + "text": ".", "type": "text" }, "event_type": { @@ -62986,33 +51110,94 @@ "data": { "event": { "delta": { - "text": "Perplex", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "bxIams_G", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:13.404182+00:00", + "__module__": "datetime" + }, + "trace_id": "snO106yxStaL10ow", + "type": "metric", + "unit": "tokens", + "value": 252 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "bxIams_G", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:13.404224+00:00", + "__module__": "datetime" + }, + "trace_id": "snO106yxStaL10ow", + "type": "metric", + "unit": "tokens", + "value": 20 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "bxIams_G", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:13.404230+00:00", + "__module__": "datetime" + }, + "trace_id": "snO106yxStaL10ow", + "type": "metric", + "unit": "tokens", + "value": 272 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Write code and execute it to find the answer for: What is the 100th prime number?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "ity the", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -63026,8 +51211,13 @@ "data": { "event": { "delta": { - "text": " company founding", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -63046,8 +51236,13 @@ "data": { "event": { "delta": { - "text": " date\")]", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "def is_prime(n):\n if n <= 1:\n return False", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -63069,15 +51264,9 @@ "parse_status": { "__enum__": "ToolCallParseStatus", "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "Perplexity the company founding date" - }, - "call_id": "1d69f6cc-160a-47d8-a452-4deb322954fd", - "tool_name": "knowledge_search" + "value": "in_progress" }, + "tool_call": "\n if n <= 3:\n return True", "type": "tool_call" }, "event_type": { @@ -63086,11 +51275,7 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } @@ -63101,58 +51286,18 @@ "data": { "event": { "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "metric": "prompt_tokens", - "unit": null, - "value": 29 - }, - { - "metric": "completion_tokens", - "unit": null, - "value": 23 - }, - { - "metric": "total_tokens", - "unit": null, - "value": 52 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"when was the nba created?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"NBA creation date\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\n if n % 2 == 0 or n % 3", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -63166,8 +51311,13 @@ "data": { "event": { "delta": { - "text": "The", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " == 0:\n return False\n i = 5\n ", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -63186,8 +51336,13 @@ "data": { "event": { "delta": { - "text": " NBA was", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " while i * i <= n:\n if n", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -63206,8 +51361,13 @@ "data": { "event": { "delta": { - "text": " created on", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " % i == 0 or n % (i", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -63226,8 +51386,13 @@ "data": { "event": { "delta": { - "text": " August ", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " + 2) == 0:\n return False\n i +=", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -63246,8 +51411,13 @@ "data": { "event": { "delta": { - "text": "3", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " 6\n return True\n\ndef nth_prime(n):\n count =", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -63266,8 +51436,13 @@ "data": { "event": { "delta": { - "text": ", ", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " 0\n num = 2\n while True:\n if", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -63286,8 +51461,13 @@ "data": { "event": { "delta": { - "text": "1949", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " is_prime(num):\n count += 1\n if count == n", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -63306,8 +51486,13 @@ "data": { "event": { "delta": { - "text": ", with", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ":\n return num\n num += 1\n\nprint(nth_prime", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -63326,8 +51511,13 @@ "data": { "event": { "delta": { - "text": " the merger", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "(100))", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -63346,8 +51536,23 @@ "data": { "event": { "delta": { - "text": " of", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "def is_prime(n):\n if n <= 1:\n return False\n if n <= 3:\n return True\n if n % 2 == 0 or n % 3 == 0:\n return False\n i = 5\n while i * i <= n:\n if n % i == 0 or n % (i + 2) == 0:\n return False\n i += 6\n return True\n\ndef nth_prime(n):\n count = 0\n num = 2\n while True:\n if is_prime(num):\n count += 1\n if count == n:\n return num\n num += 1\n\nprint(nth_prime(100))" + }, + "call_id": "e1110bc1-dc83-480d-ad33-09d49f5ccc8d", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -63355,7 +51560,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -63366,33 +51575,94 @@ "data": { "event": { "delta": { - "text": " the", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "5J3hM-La", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:09.121100+00:00", + "__module__": "datetime" + }, + "trace_id": "snO106yxStaL10ow", + "type": "metric", + "unit": "tokens", + "value": 40 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "5J3hM-La", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:09.121127+00:00", + "__module__": "datetime" + }, + "trace_id": "snO106yxStaL10ow", + "type": "metric", + "unit": "tokens", + "value": 10 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "5J3hM-La", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:09.121132+00:00", + "__module__": "datetime" + }, + "trace_id": "snO106yxStaL10ow", + "type": "metric", + "unit": "tokens", + "value": 50 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"when was Perplexity the company founded?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Perplexity the company founding date\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " Basketball Association", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -63406,7 +51676,7 @@ "data": { "event": { "delta": { - "text": " of America", + "text": "Per", "type": "text" }, "event_type": { @@ -63426,7 +51696,7 @@ "data": { "event": { "delta": { - "text": " (B", + "text": "plexity the company was founded in 2022.", "type": "text" }, "event_type": { @@ -63446,33 +51716,94 @@ "data": { "event": { "delta": { - "text": "AA)", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "6jxCq3gU", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:50.430436+00:00", + "__module__": "datetime" + }, + "trace_id": "XhZWljYTTDCYF7vI", + "type": "metric", + "unit": "tokens", + "value": 68 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "6jxCq3gU", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:50.430477+00:00", + "__module__": "datetime" + }, + "trace_id": "XhZWljYTTDCYF7vI", + "type": "metric", + "unit": "tokens", + "value": 22 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "6jxCq3gU", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:50.430489+00:00", + "__module__": "datetime" + }, + "trace_id": "XhZWljYTTDCYF7vI", + "type": "metric", + "unit": "tokens", + "value": 90 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"when was Perplexity the company founded?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " and the", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -63486,7 +51817,7 @@ "data": { "event": { "delta": { - "text": " National", + "text": "[k", "type": "text" }, "event_type": { @@ -63506,7 +51837,7 @@ "data": { "event": { "delta": { - "text": " Basketball League", + "text": "nowledge_search(query=\"Perplexity the company", "type": "text" }, "event_type": { @@ -63526,7 +51857,7 @@ "data": { "event": { "delta": { - "text": " (N", + "text": " founding date\")]", "type": "text" }, "event_type": { @@ -63546,8 +51877,19 @@ "data": { "event": { "delta": { - "text": "BL).", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "Perplexity the company founding date" + }, + "call_id": "199ef050-bc11-4e4b-935d-f5241c3f40ef", + "tool_name": "knowledge_search" + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -63555,7 +51897,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -63583,19 +51929,55 @@ }, "metrics": [ { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "prompt_tokens", - "unit": null, - "value": 63 + "span_id": "m4wMGuSN", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:49.880525+00:00", + "__module__": "datetime" + }, + "trace_id": "XhZWljYTTDCYF7vI", + "type": "metric", + "unit": "tokens", + "value": 29 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "completion_tokens", - "unit": null, - "value": 45 + "span_id": "m4wMGuSN", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:49.880576+00:00", + "__module__": "datetime" + }, + "trace_id": "XhZWljYTTDCYF7vI", + "type": "metric", + "unit": "tokens", + "value": 23 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "total_tokens", - "unit": null, - "value": 108 + "span_id": "m4wMGuSN", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:49.880585+00:00", + "__module__": "datetime" + }, + "trace_id": "XhZWljYTTDCYF7vI", + "type": "metric", + "unit": "tokens", + "value": 52 } ] } @@ -63603,7 +51985,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"when was the nba created?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"when was the nba created?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"NBA creation date\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 3 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:nba_w\\nContent: The NBA was created on August 3, 1949, with the merger of the Basketball Association of America (BAA) and the National Basketball League (NBL).\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:perpl\\nContent: Perplexity the company was founded in 2022 by Aravind Srinivas, Andy Konwinski, Denis Yarats and Johnny Ho, engineers with backgrounds in back-end systems, artificial intelligence (AI) and machine learning:\\n\\n Srinivas, the CEO, worked at OpenAI as an AI researcher.\\n Konwinski was among the founding team at Databricks.\\n Yarats, the CTO, was an AI research scientist at Meta.\\n Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:perpl\\nContent: Ho, the CSO, worked as an engineer at Quora, then as a quantitative trader on Wall Street.[5]\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -63631,7 +52013,7 @@ "data": { "event": { "delta": { - "text": "[k", + "text": "The", "type": "text" }, "event_type": { @@ -63651,7 +52033,7 @@ "data": { "event": { "delta": { - "text": "nowledge_search", + "text": " NBA was created on August 3, 1949, with", "type": "text" }, "event_type": { @@ -63671,7 +52053,7 @@ "data": { "event": { "delta": { - "text": "(query=\"", + "text": " the merger of the Basketball Association of America (BAA) and the National", "type": "text" }, "event_type": { @@ -63691,7 +52073,7 @@ "data": { "event": { "delta": { - "text": "NBA", + "text": " Basketball League (NBL).", "type": "text" }, "event_type": { @@ -63711,7 +52093,108 @@ "data": { "event": { "delta": { - "text": " creation date", + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "OyfVMRgR", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:53.322420+00:00", + "__module__": "datetime" + }, + "trace_id": "TMrhR55CR-KrmGp0", + "type": "metric", + "unit": "tokens", + "value": 63 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "OyfVMRgR", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:53.322482+00:00", + "__module__": "datetime" + }, + "trace_id": "TMrhR55CR-KrmGp0", + "type": "metric", + "unit": "tokens", + "value": 45 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "OyfVMRgR", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:53.322490+00:00", + "__module__": "datetime" + }, + "trace_id": "TMrhR55CR-KrmGp0", + "type": "metric", + "unit": "tokens", + "value": 108 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"when was the nba created?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "[k", "type": "text" }, "event_type": { @@ -63731,7 +52214,7 @@ "data": { "event": { "delta": { - "text": "\")]", + "text": "nowledge_search(query=\"NBA creation date\")]", "type": "text" }, "event_type": { @@ -63760,7 +52243,7 @@ "arguments": { "query": "NBA creation date" }, - "call_id": "bac8b49d-537e-4c73-bb9e-c06475903366", + "call_id": "388e55ab-448a-4a98-905b-196c051bdeea", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -63803,18 +52286,54 @@ }, "metrics": [ { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "prompt_tokens", - "unit": null, + "span_id": "QpFMmy3B", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:52.235138+00:00", + "__module__": "datetime" + }, + "trace_id": "TMrhR55CR-KrmGp0", + "type": "metric", + "unit": "tokens", "value": 27 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "completion_tokens", - "unit": null, + "span_id": "QpFMmy3B", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:52.235160+00:00", + "__module__": "datetime" + }, + "trace_id": "TMrhR55CR-KrmGp0", + "type": "metric", + "unit": "tokens", "value": 20 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "total_tokens", - "unit": null, + "span_id": "QpFMmy3B", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:52.235165+00:00", + "__module__": "datetime" + }, + "trace_id": "TMrhR55CR-KrmGp0", + "type": "metric", + "unit": "tokens", "value": 47 } ] diff --git a/tests/integration/fixtures/recorded_responses/invoke_tool.json b/tests/integration/fixtures/recorded_responses/invoke_tool.json index f52e34333d..8db8ad9661 100644 --- a/tests/integration/fixtures/recorded_responses/invoke_tool.json +++ b/tests/integration/fixtures/recorded_responses/invoke_tool.json @@ -5,7 +5,7 @@ "__module__": "llama_stack.apis.tools.tools", "__pydantic__": "ToolInvocationResult", "data": { - "content": "completed\n[stderr]\nTraceback (most recent call last):\n line 5, in \n from bwrap.core import main\nModuleNotFoundError: No module named 'bwrap.core'\n[/stderr]", + "content": "completed\n[stdout]\n541\n[/stdout]", "error_code": null, "error_message": null, "metadata": null @@ -25,26 +25,13 @@ } } }, - "[[], {\"kwargs\": {\"code\": \"def is_prime(n):\\n if n <= 1:\\n return False\\n if n == 2:\\n return True\\n if n % 2 == 0:\\n return False\\n max_divisor = int(n**0.5) + 1\\n for d in range(3, max_divisor, 2):\\n if n % d == 0:\\n return False\\n return True\\n\\ndef nth_prime(n):\\n count = 0\\n num = 2\\n while True:\\n if is_prime(num):\\n count += 1\\n if count == n:\\n return num\\n num += 1\\n\\nprint(nth_prime(100))\", \"session_id\": \"\"}, \"tool_name\": \"code_interpreter\"}]": { - "type": "value", - "value": { - "__module__": "llama_stack.apis.tools.tools", - "__pydantic__": "ToolInvocationResult", - "data": { - "content": "error\n[stdout]\n[Errno 2] No such file or directory: 'bwrap'\n[/stdout]\n[stderr]\n[Errno 2] No such file or directory: 'bwrap'\n[/stderr]", - "error_code": null, - "error_message": null, - "metadata": null - } - } - }, "[[], {\"kwargs\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\", \"session_id\": \"\"}, \"tool_name\": \"code_interpreter\"}]": { "type": "value", "value": { "__module__": "llama_stack.apis.tools.tools", "__pydantic__": "ToolInvocationResult", "data": { - "content": "completed\n[stderr]\nTraceback (most recent call last):\n line 5, in \n from bwrap.core import main\nModuleNotFoundError: No module named 'bwrap.core'\n[/stderr]", + "content": "completed\n[stdout]\nNumber of rows and columns in the data: (10, 13)\nColumns of the data are: 13\nColumns of the data are: Index(['Year', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep',\n 'Oct', 'Nov', 'Dec'],\n dtype='object')\nDatatype of the columns are: Year int64\nJan float64\nFeb float64\nMar float64\nApr float64\nMay float64\nJun float64\nJul float64\nAug float64\nSep float64\nOct float64\nNov float64\nDec float64\ndtype: object\n[/stdout]", "error_code": null, "error_message": null, "metadata": null @@ -77,26 +64,13 @@ } } }, - "[[], {\"kwargs\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print the shape of the dataframe (number of rows and columns)\\nprint(df.shape)\\n\\n# Print the column names\\nprint(df.columns)\\n\\n# Print the data types of each column\\nprint(df.dtypes)\\n\\n# Print a summary of the dataframe (count, mean, std, min, 25%, 50%, 75%, max)\\nprint(df.describe())\", \"session_id\": \"\"}, \"tool_name\": \"code_interpreter\"}]": { - "type": "value", - "value": { - "__module__": "llama_stack.apis.tools.tools", - "__pydantic__": "ToolInvocationResult", - "data": { - "content": "error\n[stdout]\n[Errno 2] No such file or directory: 'bwrap'\n[/stdout]\n[stderr]\n[Errno 2] No such file or directory: 'bwrap'\n[/stderr]", - "error_code": null, - "error_message": null, - "metadata": null - } - } - }, "[[], {\"kwargs\": {\"code\": \"import pandas as pd\\ndf = pd.read_csv(\\\"\")\\nprint(df.head())\", \"session_id\": \"\"}, \"tool_name\": \"code_interpreter\"}]": { "type": "value", "value": { "__module__": "llama_stack.apis.tools.tools", "__pydantic__": "ToolInvocationResult", "data": { - "content": "completed\n[stderr]\nTraceback (most recent call last):\n line 5, in \n from bwrap.core import main\nModuleNotFoundError: No module named 'bwrap.core'\n[/stderr]", + "content": "completed\n[stdout]\nYear Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec\n0 2014 1.6 1.6 1.7 1.8 2.0 1.9 1.9 1.7 1.7 1.8 1.7 1.6\n1 2015 1.6 1.7 1.8 1.8 1.7 1.8 1.8 1.8 1.9 1.9 2.0 2.1\n2 2016 2.2 2.3 2.2 2.1 2.2 2.2 2.2 2.3 2.2 2.1 2.1 2.2\n3 2017 2.3 2.2 2.0 1.9 1.7 1.7 1.7 1.7 1.7 1.8 1.7 1.8\n4 2018 1.8 1.8 2.1 2.1 2.2 2.3 2.4 2.2 2.2 2.1 2.2 2.2\n[/stdout]", "error_code": null, "error_message": null, "metadata": null @@ -109,7 +83,7 @@ "__module__": "llama_stack.apis.tools.tools", "__pydantic__": "ToolInvocationResult", "data": { - "content": "completed\n[stderr]\nTraceback (most recent call last):\n line 5, in \n from bwrap.core import main\nModuleNotFoundError: No module named 'bwrap.core'\n[/stderr]", + "content": "completed\n[stderr]\nTraceback (most recent call last):\n line 142, in \n line 23, in \n from .code_execution import CodeExecutionContext, CodeExecutionRequest, CodeExecutor\nImportError: attempted relative import with no known parent package\n[/stderr]", "error_code": null, "error_message": null, "metadata": null @@ -142,26 +116,26 @@ } } }, - "[[], {\"kwargs\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load data\\ndf = pd.read_csv(\\\"inflation.csv\\\")\\n\\n# Convert date column to datetime\\ndf['date'] = pd.to_datetime(df['date'])\\n\\n# Group by year and calculate average inflation\\naverage_inflation = df.groupby(df['date'].dt.year)['inflation'].mean()\\n\\n# Plot average yearly inflation as a time series\\nplt.figure(figsize=(10,6))\\nplt.plot(average_inflation.index, average_inflation.values, marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Average Inflation')\\nplt.grid(True)\\nplt.show()\", \"session_id\": \"\"}, \"tool_name\": \"code_interpreter\"}]": { + "[[], {\"kwargs\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n\\n# Calculate average yearly inflation\\ndf['Average'] = df[['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']].mean(axis=1)\\n\\n# Plot time series\\nplt.figure(figsize=(10,6))\\nplt.plot(df['Year'], df['Average'])\\nplt.xlabel('Year')\\nplt.ylabel('Average Yearly Inflation')\\nplt.title('Average Yearly Inflation Over Time')\\nplt.grid(True)\\nplt.show()\", \"session_id\": \"\"}, \"tool_name\": \"code_interpreter\"}]": { "type": "value", "value": { "__module__": "llama_stack.apis.tools.tools", "__pydantic__": "ToolInvocationResult", "data": { - "content": "completed\n[stderr]\nTraceback (most recent call last):\n line 5, in \n from bwrap.core import main\nModuleNotFoundError: No module named 'bwrap.core'\n[/stderr]", + "content": "completed", "error_code": null, "error_message": null, "metadata": null } } }, - "[[], {\"kwargs\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Convert the 'Year' column to datetime\\ndf['Year'] = pd.to_datetime(df['Year'], format='%Y')\\n\\n# Group by 'Year' and calculate the average inflation\\ndf_avg_inflation = df.groupby('Year')['Inflation'].mean().reset_index()\\n\\n# Plot the average inflation as a time series\\nplt.figure(figsize=(10,6))\\nplt.plot(df_avg_inflation['Year'], df_avg_inflation['Inflation'], marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Inflation')\\nplt.grid(True)\\nplt.show()\", \"session_id\": \"\"}, \"tool_name\": \"code_interpreter\"}]": { + "[[], {\"kwargs\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load data\\ndf = pd.read_csv(\\\"inflation.csv\\\")\\n\\n# Convert date column to datetime\\ndf['date'] = pd.to_datetime(df['date'])\\n\\n# Group by year and calculate average inflation\\naverage_inflation = df.groupby(df['date'].dt.year)['inflation'].mean()\\n\\n# Plot average yearly inflation as a time series\\nplt.figure(figsize=(10,6))\\nplt.plot(average_inflation.index, average_inflation.values, marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Average Inflation')\\nplt.grid(True)\\nplt.show()\", \"session_id\": \"\"}, \"tool_name\": \"code_interpreter\"}]": { "type": "value", "value": { "__module__": "llama_stack.apis.tools.tools", "__pydantic__": "ToolInvocationResult", "data": { - "content": "error\n[stdout]\n[Errno 2] No such file or directory: 'bwrap'\n[/stdout]\n[stderr]\n[Errno 2] No such file or directory: 'bwrap'\n[/stderr]", + "content": "completed\n[stderr]\nTraceback (most recent call last):\n line 5, in \n from bwrap.core import main\nModuleNotFoundError: No module named 'bwrap.core'\n[/stderr]", "error_code": null, "error_message": null, "metadata": null @@ -193,23 +167,23 @@ "type": "text" }, { - "text": "Result 1:\nDocument_id:961ff\nContent: .. _lora_finetune_label:\n\n============================\nFine-Tuning Llama2 with LoRA\n============================\n\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\n See :ref:`below` for how to do this.\n\nLet's inspect each of these models a bit more closely.\n\n.. code-block:: bash\n\n # Print the first layer's self-attention in the usual Llama2 model\n >>> print(base_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (pos_embeddings): RotaryPositionalEmbeddings()\n )\n\n # Print the same for Llama2 with LoRA weights\n >>> print(lora_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): LoRALinear(\n (dropout): Dropout(p=0.0, inplace=False)\n \n", + "text": "Result 2:\nDocument_id:15b86\nContent: LoRA to Llama2 models\n------------------------------\n\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\n\n.. code-block:: python\n\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\n\n # Build Llama2 without any LoRA layers\n base_model = llama2_7b()\n\n # The default settings for lora_llama2_7b will match those for llama2_7b\n # We just need to define which layers we want LoRA applied to.\n # Within each self-attention, we can choose from [\"q_proj\", \"k_proj\", \"v_proj\", and \"output_proj\"].\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\n # layers outside of the self-attention.\n lora_model = lora_llama2_7b(lora_attn_modules=[\"q_proj\", \"v_proj\"])\n\n.. note::\n\n Calling :func:`lora_llama_2_7b ` alone will not handle the definition of which parameters are trainable.\n See :ref:`below` for how to do this.\n\nLet's inspect each of these models a bit more closely.\n\n.. code-block:: bash\n\n # Print the first layer's self-attention in the usual Llama2 model\n >>> print(base_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (pos_embeddings): RotaryPositionalEmbeddings()\n )\n\n # Print the same for Llama2 with LoRA weights\n >>> print(lora_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): LoRALinear(\n (dropout): Dropout(p=0.0, inplace=False)\n \n", "type": "text" }, { - "text": "Result 3:\nDocument_id:961ff\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", + "text": "Result 3:\nDocument_id:15b86\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", "type": "text" }, { - "text": "Result 4:\nDocument_id:961ff\nContent: from our Llama2\nmodel without any wrappers or custom checkpoint conversion logic.\n\n.. code-block:: python\n\n # Assuming that base_model already has the pretrained Llama2 weights,\n # this will directly load them into your LoRA model without any conversion necessary.\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\n\n.. note::\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\n :func:`validate_missing_and_unexpected_for_lora() `.\n\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\n\n.. _setting_trainable_params:\n\n.. code-block:: python\n\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\n\n # Fetch all params from the model that are associated with LoRA.\n lora_params = get_adapter_params(lora_model)\n\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\n set_trainable_params(lora_model, lora_params)\n\n # Print the total number of parameters\n total_params = sum([p.numel() for p in lora_model.parameters()])\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\n print(\n f\"\"\"\n {total_params} total params,\n {trainable_params}\" trainable params,\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\n \"\"\"\n )\n\n 6742609920 total params,\n 4194304 trainable params,\n 0.06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `.\n\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\n\n.. _setting_trainable_params:\n\n.. code-block:: python\n\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\n\n # Fetch all params from the model that are associated with LoRA.\n lora_params = get_adapter_params(lora_model)\n\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\n set_trainable_params(lora_model, lora_params)\n\n # Print the total number of parameters\n total_params = sum([p.numel() for p in lora_model.parameters()])\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\n print(\n f\"\"\"\n {total_params} total params,\n {trainable_params}\" trainable params,\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\n \"\"\"\n )\n\n 6742609920 total params,\n 4194304 trainable params,\n 0.06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \"}, \"tool_name\": \"web_search\"}]": { + "type": "value", + "value": { + "__module__": "llama_stack.apis.tools.tools", + "__pydantic__": "ToolInvocationResult", + "data": { + "content": "{\"query\": \"Meta founder\", \"top_k\": [{\"title\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\", \"url\": \"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\", \"content\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\u00a9 2025 Meta\", \"score\": 0.81595254, \"raw_content\": null}, {\"title\": \"Executives - Meta\", \"url\": \"https://about.meta.com/media-gallery/executives/\", \"content\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer Joel Kaplan, Chief Global Affairs Officer Susan Li, Chief Financial Officer Javier Olivan, Chief Operating Officer Chris Cox, Chief Product Officer Andrew \\u2018Boz\\u2019 Bosworth, Chief Technology Officer Jennifer Newstead, Chief Legal Officer Dave Wehner, Chief Strategy Officer Will Cathcart, Head of WhatsApp Naomi Gleit, Head of Product John Hegeman, Chief Revenue Officer Adam Mosseri, Head of Instagram Erin Egan, Chief Privacy Officer, Policy Michel Protti, Chief Privacy Officer, Product Alex Schultz, Chief Marketing Officer and VP of Analytics Tom Alison, Head of Facebook Nicola Mendelsohn, Head of Global Business Group Ahmad Al-Dahle, VP and Head of GenAI at Meta Joelle Pineau, Vice President of AI Research and Head of FAIR at Meta\", \"score\": 0.70726365, \"raw_content\": null}, {\"title\": \"Meta - Leadership & Governance\", \"url\": \"https://investor.atmeta.com/leadership-and-governance/\", \"content\": \"Mr. Andreessen was a co-founder of Netscape Communications Corporation, a software company, serving in various positions, including Chief Technology Officer and Executive Vice President of Products. Ms. Killefer also served as Assistant Secretary for Management, Chief Financial Officer, and Chief Operating Officer of the U.S. Department of the Treasury from 1997 to 2000 and as a member of the IRS Oversight Board from 2000 to 2005, including as Chair of the IRS Oversight Board from 2002 to 2004. Ms. Travis has served as Executive Vice President and Chief Financial Officer of The Estee Lauder Companies Inc., a global manufacturer and marketer of skin care, makeup, fragrance and hair care products, since August 2012.\", \"score\": 0.467308, \"raw_content\": null}, {\"title\": \"Meta Platforms - Wikipedia\", \"url\": \"https://en.wikipedia.org/wiki/Meta_Platforms\", \"content\": \"Following a period of intense scrutiny and damaging whistleblower leaks, news started to emerge on October 21, 2021, about Facebook's plan to rebrand the company and change its name.[15][54] In the Q3 2021 Earnings Call on October 25, Mark Zuckerberg discussed the ongoing criticism of the company's social services and the way it operates, and pointed to the pivoting efforts to building the metaverse \\u2013 without mentioning the rebranding and the name change.[55] The metaverse vision and the name change from Facebook, Inc. to Meta Platforms was introduced at Facebook Connect on October 28, 2021.[16] Based on Facebook's PR campaign, the name change reflects the company's shifting long term focus of building the metaverse, a digital extension of the physical world by social media, virtual reality and augmented reality features.[16][56]\", \"score\": 0.14999175, \"raw_content\": null}, {\"title\": \"Mark Zuckerberg - Wikipedia\", \"url\": \"https://en.wikipedia.org/wiki/Mark_Zuckerberg\", \"content\": \"They began dating in 2003.[175] In September 2010, Chan, who was a medical student at the University of California, San Francisco at the time,[176] moved into his rented house in Palo Alto, California.[177][178] They married on May 19, 2012, in the grounds of his mansion in an event that also celebrated her graduation from medical school.[179][180] Zuckerberg revealed in July 2015 that they were expecting a baby girl and that Chan had previously experienced three miscarriages.[181] Their first daughter was born in December 2015.[182] They announced in a Chinese New Year video that their daughter's Chinese name is Chen Mingyu (Chinese: \\u9648\\u660e\\u5b87).[183] Their second daughter was born in August 2017.[184] Zuckerberg and his wife welcomed their third daughter in March 2023 and announced the news across his social media pages.[185] The couple also have a Puli dog named Beast,[186] who has over two million followers on Facebook.[187] Zuckerberg commissioned the visual artist Daniel Arsham to build a 7-foot-tall sculpture of his wife, which was unveiled in 2024.[188]\", \"score\": 0.03678684, \"raw_content\": null}]}", + "error_code": null, + "error_message": null, + "metadata": null + } + } + }, "[[], {\"kwargs\": {\"query\": \"NBA creation date\", \"session_id\": \"\", \"vector_db_ids\": [\"test-vector-db-\"]}, \"tool_name\": \"knowledge_search\"}]": { "type": "value", "value": { @@ -413,23 +400,23 @@ "type": "text" }, { - "text": "Result 1:\nDocument_id:89553\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\nlook like so:\n\n.. code-block:: python\n\n from torchtune.datasets import chat_dataset\n from torchtune.models.llama3 import llama3_tokenizer\n\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\n ds = chat_dataset(\n tokenizer=tokenizer,\n source=\"json\",\n data_files=\"data/my_data.json\",\n split=\"train\",\n conversation_column=\"dialogue\",\n conversation_style=\"sharegpt\",\n )\n\n.. code-block:: yaml\n\n # In config\n tokenizer:\n _component_: torchtune.models.llama3.llama3_tokenizer\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\n\n dataset:\n _component_: torchtune.datasets.chat_dataset\n source: json\n data_files: data/my_data.json\n split: train\n conversation_column: dialogue\n conversation_style: sharegpt\n\n.. note::\n You can pass in any keyword argument for `load_dataset `_ into all our\n Dataset classes and they will honor them. This is useful for common parameters\n such as specifying the data split with :code:`split` or configuration with\n :code:`name`\n\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\nall messages according to their `recommendations `_ into all our\n Dataset classes and they will honor them. This is useful for common parameters\n such as specifying the data split with :code:`split` or configuration with\n :code:`name`\n\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW `.\n.. .. _glossary_fsdp2:\n\n", + "text": "Result 3:\nDocument_id:83901\nContent: ` module, which we swap\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP `.\n.. .. _glossary_fsdp2:\n\n", "type": "text" }, { - "text": "Result 4:\nDocument_id:700ad\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", + "text": "Result 4:\nDocument_id:15b86\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", "type": "text" }, { - "text": "Result 5:\nDocument_id:122a9\nContent: etune\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.use_dora=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n use_dora: True\n\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\neven more memory savings!\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.apply_lora_to_mlp=True \\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\n model.lora_rank=16 \\\n model.lora_alpha=32 \\\n model.use_dora=True \\\n model.quantize_base=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n apply_lora_to_mlp: True\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\n lora_rank: 16\n lora_alpha: 32\n use_dora: True\n quantize_base: True\n\n\n.. note::\n\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP `.\n.. .. _glossary_fsdp2:\n\n", + "text": "Result 5:\nDocument_id:83901\nContent: etune\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.use_dora=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n use_dora: True\n\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\neven more memory savings!\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.apply_lora_to_mlp=True \\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\n model.lora_rank=16 \\\n model.lora_alpha=32 \\\n model.use_dora=True \\\n model.quantize_base=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n apply_lora_to_mlp: True\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\n lora_rank: 16\n lora_alpha: 32\n use_dora: True\n quantize_base: True\n\n\n.. note::\n\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP `.\n.. .. _glossary_fsdp2:\n\n", "type": "text" }, { @@ -441,11 +428,11 @@ "error_message": null, "metadata": { "document_ids": [ - "895539dd-a627-4c02-94d7-6591cd0ce00f", - "700ad5a6-e318-48ad-99b2-93934c5d7f8c", - "122a966a-6d33-4482-87a6-f5d16e9f92be", - "700ad5a6-e318-48ad-99b2-93934c5d7f8c", - "122a966a-6d33-4482-87a6-f5d16e9f92be" + "bbddbe62-508d-4c8d-9455-3b60bc2825a5", + "15b8638f-b1b6-4f58-adfa-eb6644c47de3", + "83901b53-33d4-4f5e-8145-b94c783e9f61", + "15b8638f-b1b6-4f58-adfa-eb6644c47de3", + "83901b53-33d4-4f5e-8145-b94c783e9f61" ] } } @@ -457,7 +444,7 @@ "__module__": "llama_stack.apis.tools.tools", "__pydantic__": "ToolInvocationResult", "data": { - "content": "{\"query\": \"current CEO of Meta\", \"top_k\": [{\"title\": \"Executives - Meta\", \"url\": \"https://about.meta.com/media-gallery/executives/\", \"content\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer Joel Kaplan, Chief Global Affairs Officer Susan Li, Chief Financial Officer Javier Olivan, Chief Operating Officer Chris Cox, Chief Product Officer Andrew \\u2018Boz\\u2019 Bosworth, Chief Technology Officer Jennifer Newstead, Chief Legal Officer Dave Wehner, Chief Strategy Officer Will Cathcart, Head of WhatsApp Naomi Gleit, Head of Product John Hegeman, Chief Revenue Officer Adam Mosseri, Head of Instagram Erin Egan, Chief Privacy Officer, Policy Michel Protti, Chief Privacy Officer, Product Alex Schultz, Chief Marketing Officer and VP of Analytics Tom Alison, Head of Facebook Nicola Mendelsohn, Head of Global Business Group Ahmad Al-Dahle, VP and Head of GenAI at Meta Joelle Pineau, Vice President of AI Research and Head of FAIR at Meta\", \"score\": 0.8190992, \"raw_content\": null}, {\"title\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\", \"url\": \"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\", \"content\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\u00a9 2025 Meta\", \"score\": 0.79080546, \"raw_content\": null}, {\"title\": \"Meet the Executive CSuite Team of Meta (Facebook) [2025]\", \"url\": \"https://digitaldefynd.com/IQ/meet-the-executive-csuite-team-of-meta-facebook/\", \"content\": \"Harvard University Executive Programs Free Harvard University Courses As a chief financial officer of Meta, Susan Li oversees the firm\\u2019s finance and facilities team to keep track of the company\\u2019s overall financial health. The chief operating officer of Meta, Javier Olivan, oversees the firm\\u2019s business team, infrastructure, and other products. Andrew Bosworth, called Boz, serves as chief technology officer at Meta and is responsible for leading the firm\\u2019s AR/VR organization, Reality Labs. Andrew has also served as engineering director to oversee events, mobile monetization, and feed ads and as VP of ads and business platforms to lead engineering, design, analytics, and product teams. Meta\\u2019s c-suite team comprises experienced and diverse executives, having extensive experience in technology, finance, legal, and all major industries.\", \"score\": 0.7602419, \"raw_content\": null}, {\"title\": \"Meta - Leadership & Governance\", \"url\": \"https://investor.atmeta.com/leadership-and-governance/\", \"content\": \"Mr. Andreessen was a co-founder of Netscape Communications Corporation, a software company, serving in various positions, including Chief Technology Officer and Executive Vice President of Products. Ms. Killefer also served as Assistant Secretary for Management, Chief Financial Officer, and Chief Operating Officer of the U.S. Department of the Treasury from 1997 to 2000 and as a member of the IRS Oversight Board from 2000 to 2005, including as Chair of the IRS Oversight Board from 2002 to 2004. Ms. Travis has served as Executive Vice President and Chief Financial Officer of The Estee Lauder Companies Inc., a global manufacturer and marketer of skin care, makeup, fragrance and hair care products, since August 2012.\", \"score\": 0.6175132, \"raw_content\": null}, {\"title\": \"Mark Zuckerberg - Wikipedia\", \"url\": \"https://en.wikipedia.org/wiki/Mark_Zuckerberg\", \"content\": \"They began dating in 2003.[175] In September 2010, Chan, who was a medical student at the University of California, San Francisco at the time,[176] moved into his rented house in Palo Alto, California.[177][178] They married on May 19, 2012, in the grounds of his mansion in an event that also celebrated her graduation from medical school.[179][180] Zuckerberg revealed in July 2015 that they were expecting a baby girl and that Chan had previously experienced three miscarriages.[181] Their first daughter was born in December 2015.[182] They announced in a Chinese New Year video that their daughter's Chinese name is Chen Mingyu (Chinese: \\u9648\\u660e\\u5b87).[183] Their second daughter was born in August 2017.[184] Zuckerberg and his wife welcomed their third daughter in March 2023 and announced the news across his social media pages.[185] The couple also have a Puli dog named Beast,[186] who has over two million followers on Facebook.[187] Zuckerberg commissioned the visual artist Daniel Arsham to build a 7-foot-tall sculpture of his wife, which was unveiled in 2024.[188]\", \"score\": 0.05570498, \"raw_content\": null}]}", + "content": "{\"query\": \"current CEO of Meta\", \"top_k\": [{\"title\": \"Meta - Leadership & Governance\", \"url\": \"https://investor.atmeta.com/leadership-and-governance/\", \"content\": \"Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. Mark is responsible for setting the overall direction and product strategy for the company. He leads the design of Meta's services and development of its core technology and infrastructure. Mark studied computer science at Harvard\", \"score\": 0.8342047, \"raw_content\": null}, {\"title\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\", \"url\": \"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\", \"content\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\u00a9 2025 Meta\", \"score\": 0.79099923, \"raw_content\": null}, {\"title\": \"The 11 People Running Meta's $1 Trillion Social Media and ... - Observer\", \"url\": \"https://observer.com/2024/01/meta-facebook-top-executives/\", \"content\": \"Meta has one of the most stable leadership team in the tech industry. Almost all of Meta's top executives have been with the company for well over a decade. ... 39, cofounder, chairman and CEO\", \"score\": 0.45536873, \"raw_content\": null}, {\"title\": \"Executives - Meta\", \"url\": \"https://about.meta.com/media-gallery/executives/\", \"content\": \"Meta leadership: images of senior executives for download to use in articles about the company.\", \"score\": 0.21026355, \"raw_content\": null}, {\"title\": \"Mark Zuckerberg - Wikipedia\", \"url\": \"https://en.wikipedia.org/wiki/Mark_Zuckerberg\", \"content\": \"They began dating in 2003.[175] In September 2010, Chan, who was a medical student at the University of California, San Francisco at the time,[176] moved into his rented house in Palo Alto, California.[177][178] They married on May 19, 2012, in the grounds of his mansion in an event that also celebrated her graduation from medical school.[179][180] Zuckerberg revealed in July 2015 that they were expecting a baby girl and that Chan had previously experienced three miscarriages.[181] Their first daughter was born in December 2015.[182] They announced in a Chinese New Year video that their daughter's Chinese name is Chen Mingyu (Chinese: \\u9648\\u660e\\u5b87).[183] Their second daughter was born in August 2017.[184] Zuckerberg and his wife welcomed their third daughter in March 2023 and announced the news across his social media pages.[185] The couple also have a Puli dog named Beast,[186] who has over two million followers on Facebook.[187] Zuckerberg commissioned the visual artist Daniel Arsham to build a 7-foot-tall sculpture of his wife, which was unveiled in 2024.[188]\", \"score\": 0.05564338, \"raw_content\": null}]}", "error_code": null, "error_message": null, "metadata": null @@ -476,23 +463,23 @@ "type": "text" }, { - "text": "Result 1:\nDocument_id:700ad\nContent: .. _lora_finetune_label:\n\n============================\nFine-Tuning Llama2 with LoRA\n============================\n\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\n See :ref:`below` for how to do this.\n\nLet's inspect each of these models a bit more closely.\n\n.. code-block:: bash\n\n # Print the first layer's self-attention in the usual Llama2 model\n >>> print(base_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (pos_embeddings): RotaryPositionalEmbeddings()\n )\n\n # Print the same for Llama2 with LoRA weights\n >>> print(lora_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): LoRALinear(\n (dropout): Dropout(p=0.0, inplace=False)\n \n", + "text": "Result 2:\nDocument_id:20e5d\nContent: LoRA to Llama2 models\n------------------------------\n\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\n\n.. code-block:: python\n\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\n\n # Build Llama2 without any LoRA layers\n base_model = llama2_7b()\n\n # The default settings for lora_llama2_7b will match those for llama2_7b\n # We just need to define which layers we want LoRA applied to.\n # Within each self-attention, we can choose from [\"q_proj\", \"k_proj\", \"v_proj\", and \"output_proj\"].\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\n # layers outside of the self-attention.\n lora_model = lora_llama2_7b(lora_attn_modules=[\"q_proj\", \"v_proj\"])\n\n.. note::\n\n Calling :func:`lora_llama_2_7b ` alone will not handle the definition of which parameters are trainable.\n See :ref:`below` for how to do this.\n\nLet's inspect each of these models a bit more closely.\n\n.. code-block:: bash\n\n # Print the first layer's self-attention in the usual Llama2 model\n >>> print(base_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (pos_embeddings): RotaryPositionalEmbeddings()\n )\n\n # Print the same for Llama2 with LoRA weights\n >>> print(lora_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): LoRALinear(\n (dropout): Dropout(p=0.0, inplace=False)\n \n", "type": "text" }, { - "text": "Result 3:\nDocument_id:700ad\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", + "text": "Result 3:\nDocument_id:20e5d\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", "type": "text" }, { - "text": "Result 4:\nDocument_id:700ad\nContent: from our Llama2\nmodel without any wrappers or custom checkpoint conversion logic.\n\n.. code-block:: python\n\n # Assuming that base_model already has the pretrained Llama2 weights,\n # this will directly load them into your LoRA model without any conversion necessary.\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\n\n.. note::\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\n :func:`validate_missing_and_unexpected_for_lora() `.\n\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\n\n.. _setting_trainable_params:\n\n.. code-block:: python\n\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\n\n # Fetch all params from the model that are associated with LoRA.\n lora_params = get_adapter_params(lora_model)\n\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\n set_trainable_params(lora_model, lora_params)\n\n # Print the total number of parameters\n total_params = sum([p.numel() for p in lora_model.parameters()])\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\n print(\n f\"\"\"\n {total_params} total params,\n {trainable_params}\" trainable params,\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\n \"\"\"\n )\n\n 6742609920 total params,\n 4194304 trainable params,\n 0.06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `.\n\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\n\n.. _setting_trainable_params:\n\n.. code-block:: python\n\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\n\n # Fetch all params from the model that are associated with LoRA.\n lora_params = get_adapter_params(lora_model)\n\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\n set_trainable_params(lora_model, lora_params)\n\n # Print the total number of parameters\n total_params = sum([p.numel() for p in lora_model.parameters()])\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\n print(\n f\"\"\"\n {total_params} total params,\n {trainable_params}\" trainable params,\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\n \"\"\"\n )\n\n 6742609920 total params,\n 4194304 trainable params,\n 0.06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe Date: Wed, 12 Mar 2025 23:23:32 -0700 Subject: [PATCH 13/14] new mocks --- .../recorded_responses/chat_completion.json | 8715 ++++++++++++----- .../recorded_responses/invoke_tool.json | 55 +- 2 files changed, 6149 insertions(+), 2621 deletions(-) diff --git a/tests/integration/fixtures/recorded_responses/chat_completion.json b/tests/integration/fixtures/recorded_responses/chat_completion.json index 8694cc2713..80744576ea 100644 --- a/tests/integration/fixtures/recorded_responses/chat_completion.json +++ b/tests/integration/fixtures/recorded_responses/chat_completion.json @@ -39542,7 +39542,7 @@ "data": { "event": { "delta": { - "text": " provided function definitions", + "text": " provided function definitions are", "type": "text" }, "event_type": { @@ -39562,7 +39562,7 @@ "data": { "event": { "delta": { - "text": " are not suitable", + "text": " not suitable for this task", "type": "text" }, "event_type": { @@ -39582,7 +39582,7 @@ "data": { "event": { "delta": { - "text": " for this task. Please re", + "text": ". Please rework them", "type": "text" }, "event_type": { @@ -39602,27 +39602,7 @@ "data": { "event": { "delta": { - "text": "work them to", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " align with the task requirements.", + "text": " to align with the task requirements.", "type": "text" }, "event_type": { @@ -39659,54 +39639,18 @@ }, "metrics": [ { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "prompt_tokens", - "span_id": "D2n_IS_8", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:03:32.021393+00:00", - "__module__": "datetime" - }, - "trace_id": "amAiZv5PQKSsA74j", - "type": "metric", - "unit": "tokens", + "unit": null, "value": 90 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "completion_tokens", - "span_id": "D2n_IS_8", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:03:32.021420+00:00", - "__module__": "datetime" - }, - "trace_id": "amAiZv5PQKSsA74j", - "type": "metric", - "unit": "tokens", + "unit": null, "value": 32 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "total_tokens", - "span_id": "D2n_IS_8", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:03:32.021427+00:00", - "__module__": "datetime" - }, - "trace_id": "amAiZv5PQKSsA74j", - "type": "metric", - "unit": "tokens", + "unit": null, "value": 122 } ] @@ -40551,7 +40495,7 @@ "data": { "event": { "delta": { - "text": "get_boiling_point(liquid_name='polyjuice", + "text": "get_boiling_point(liquid_name='polyjuice', celcius", "type": "text" }, "event_type": { @@ -40571,7 +40515,7 @@ "data": { "event": { "delta": { - "text": "', celcius=True)]", + "text": "=True)]", "type": "text" }, "event_type": { @@ -40601,7 +40545,7 @@ "celcius": true, "liquid_name": "polyjuice" }, - "call_id": "3955f756-9aa0-433f-be8f-af8941c220de", + "call_id": "152ff1b5-7bcf-4d46-8c8f-e5c13f646925", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -40644,54 +40588,18 @@ }, "metrics": [ { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "prompt_tokens", - "span_id": "QZ6PSGpT", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:03:29.629456+00:00", - "__module__": "datetime" - }, - "trace_id": "M72bosg8TBe3uhx3", - "type": "metric", - "unit": "tokens", + "unit": null, "value": 43 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "completion_tokens", - "span_id": "QZ6PSGpT", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:03:29.629488+00:00", - "__module__": "datetime" - }, - "trace_id": "M72bosg8TBe3uhx3", - "type": "metric", - "unit": "tokens", + "unit": null, "value": 28 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "total_tokens", - "span_id": "QZ6PSGpT", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:03:29.629494+00:00", - "__module__": "datetime" - }, - "trace_id": "M72bosg8TBe3uhx3", - "type": "metric", - "unit": "tokens", + "unit": null, "value": 71 } ] @@ -40748,7 +40656,7 @@ "data": { "event": { "delta": { - "text": " function call returned an", + "text": " function get_boiling_point was called with", "type": "text" }, "event_type": { @@ -40768,7 +40676,7 @@ "data": { "event": { "delta": { - "text": " error since", + "text": " the parameters liquid_name = \"polyjuice\" and", "type": "text" }, "event_type": { @@ -40788,7 +40696,7 @@ "data": { "event": { "delta": { - "text": " \"", + "text": " celcius = True. However, the function returned -", "type": "text" }, "event_type": { @@ -40808,7 +40716,7 @@ "data": { "event": { "delta": { - "text": "polyjuice\" is", + "text": "100, which is not a valid", "type": "text" }, "event_type": { @@ -40828,7 +40736,7 @@ "data": { "event": { "delta": { - "text": " not a real liquid. Polyju", + "text": " boiling point. This suggests that the function does not", "type": "text" }, "event_type": { @@ -40848,7 +40756,7 @@ "data": { "event": { "delta": { - "text": "ice is a fictional substance from the", + "text": " have the boiling point of \"poly", "type": "text" }, "event_type": { @@ -40868,7 +40776,7 @@ "data": { "event": { "delta": { - "text": " Harry Potter series. The boiling point", + "text": "juice\" in its database.", "type": "text" }, "event_type": { @@ -40888,33 +40796,58 @@ "data": { "event": { "delta": { - "text": " of a substance is a physical", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 84 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 73 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 157 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " property that can be measured and", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -40928,7 +40861,7 @@ "data": { "event": { "delta": { - "text": " quantified", + "text": "The", "type": "text" }, "event_type": { @@ -40948,7 +40881,7 @@ "data": { "event": { "delta": { - "text": ", but it only applies", + "text": " function get_boiling_point is not", "type": "text" }, "event_type": { @@ -40968,7 +40901,7 @@ "data": { "event": { "delta": { - "text": " to real substances that exist in the physical world.", + "text": " recognized.", "type": "text" }, "event_type": { @@ -41010,16 +40943,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "y9SHtJTQ", + "span_id": "Z7jBGJ-8", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T02:05:01.411612+00:00", + "__datetime__": "2025-03-07T01:45:55.401637+00:00", "__module__": "datetime" }, - "trace_id": "_I2Cu85IRtOSBSX9", + "trace_id": "WxMAq579Q-ixJ3wJ", "type": "metric", "unit": "tokens", - "value": 84 + "value": 93 }, { "attributes": { @@ -41027,16 +40960,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "y9SHtJTQ", + "span_id": "Z7jBGJ-8", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T02:05:01.411644+00:00", + "__datetime__": "2025-03-07T01:45:55.401666+00:00", "__module__": "datetime" }, - "trace_id": "_I2Cu85IRtOSBSX9", + "trace_id": "WxMAq579Q-ixJ3wJ", "type": "metric", "unit": "tokens", - "value": 73 + "value": 20 }, { "attributes": { @@ -41044,16 +40977,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "y9SHtJTQ", + "span_id": "Z7jBGJ-8", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T02:05:01.411650+00:00", + "__datetime__": "2025-03-07T01:45:55.401670+00:00", "__module__": "datetime" }, - "trace_id": "_I2Cu85IRtOSBSX9", + "trace_id": "WxMAq579Q-ixJ3wJ", "type": "metric", "unit": "tokens", - "value": 157 + "value": 113 } ] } @@ -41061,7 +40994,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point_with_metadata\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point_with_metadata\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -41109,7 +41042,7 @@ "data": { "event": { "delta": { - "text": " function get_boiling_point is not", + "text": " function call should have been", "type": "text" }, "event_type": { @@ -41129,7 +41062,7 @@ "data": { "event": { "delta": { - "text": " recognized.", + "text": " \n[get_boiling_point_with", "type": "text" }, "event_type": { @@ -41149,94 +41082,53 @@ "data": { "event": { "delta": { - "text": "", + "text": "_metadata(liquid_name='polyjuice', celci", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "Z7jBGJ-8", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:55.401637+00:00", - "__module__": "datetime" - }, - "trace_id": "WxMAq579Q-ixJ3wJ", - "type": "metric", - "unit": "tokens", - "value": 93 + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "us=True)] \nHowever since the", + "type": "text" }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "Z7jBGJ-8", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:55.401666+00:00", - "__module__": "datetime" - }, - "trace_id": "WxMAq579Q-ixJ3wJ", - "type": "metric", - "unit": "tokens", - "value": 20 + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "Z7jBGJ-8", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:55.401670+00:00", - "__module__": "datetime" - }, - "trace_id": "WxMAq579Q-ixJ3wJ", - "type": "metric", - "unit": "tokens", - "value": 113 - } - ] + "logprobs": null, + "stop_reason": null + }, + "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point_with_metadata\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point_with_metadata\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", + "text": " function get_boiling_point_with_metadata does not", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -41250,7 +41142,7 @@ "data": { "event": { "delta": { - "text": "The", + "text": " exist in the ipython environment,", "type": "text" }, "event_type": { @@ -41270,7 +41162,7 @@ "data": { "event": { "delta": { - "text": " function get_bo", + "text": " it will throw an", "type": "text" }, "event_type": { @@ -41290,7 +41182,7 @@ "data": { "event": { "delta": { - "text": "iling_point_with_metadata does not exist,", + "text": " error. \n\nIn order to", "type": "text" }, "event_type": { @@ -41310,7 +41202,7 @@ "data": { "event": { "delta": { - "text": " I will", + "text": " get the correct answer, you would need to define the function", "type": "text" }, "event_type": { @@ -41330,7 +41222,7 @@ "data": { "event": { "delta": { - "text": " assume you", + "text": " get_boiling_point_with_metadata first. \n\nHere is how you", "type": "text" }, "event_type": { @@ -41350,7 +41242,7 @@ "data": { "event": { "delta": { - "text": " meant get_bo", + "text": " can define it:\n\n```python\ndef get_boiling", "type": "text" }, "event_type": { @@ -41370,7 +41262,7 @@ "data": { "event": { "delta": { - "text": "iling_point_with_metadata", + "text": "_point_with_metadata(liquid_name, celcius=True):\n", "type": "text" }, "event_type": { @@ -41390,7 +41282,7 @@ "data": { "event": { "delta": { - "text": ". The boiling point of polyjuice", + "text": " # This is a mock implementation,", "type": "text" }, "event_type": { @@ -41410,7 +41302,7 @@ "data": { "event": { "delta": { - "text": " is -100.", + "text": " you would need a real", "type": "text" }, "event_type": { @@ -41430,94 +41322,33 @@ "data": { "event": { "delta": { - "text": "", + "text": " database of boiling points\n boiling_points", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "8dM6i5mO", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:05:03.329281+00:00", - "__module__": "datetime" - }, - "trace_id": "zMJDP5dXRrChi7uE", - "type": "metric", - "unit": "tokens", - "value": 86 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "8dM6i5mO", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:05:03.329312+00:00", - "__module__": "datetime" - }, - "trace_id": "zMJDP5dXRrChi7uE", - "type": "metric", - "unit": "tokens", - "value": 45 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "8dM6i5mO", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:05:03.329318+00:00", - "__module__": "datetime" - }, - "trace_id": "zMJDP5dXRrChi7uE", - "type": "metric", - "unit": "tokens", - "value": 131 - } - ] + "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point_with_metadata\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point_with_metadata` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point_with_metadata\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", + "text": " = {\n \"water\": 100,\n \"polyjuice\":", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -41531,7 +41362,7 @@ "data": { "event": { "delta": { - "text": "The", + "text": " 120 #", "type": "text" }, "event_type": { @@ -41551,7 +41382,7 @@ "data": { "event": { "delta": { - "text": " function get_boiling_point_with_metadata(", + "text": " This is fictional, polyjuice is not a", "type": "text" }, "event_type": { @@ -41571,7 +41402,7 @@ "data": { "event": { "delta": { - "text": "liquid_name=\"polyjuice\", celcius=True) should be", + "text": " real liquid\n", "type": "text" }, "event_type": { @@ -41591,7 +41422,7 @@ "data": { "event": { "delta": { - "text": " used to get the answer.", + "text": " }\n \n if liquid_name in boiling", "type": "text" }, "event_type": { @@ -41611,94 +41442,33 @@ "data": { "event": { "delta": { - "text": "", + "text": "_points:\n if celcius:\n return boiling_points[liquid_name", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "pzQMKAJc", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:56.809816+00:00", - "__module__": "datetime" - }, - "trace_id": "018KkGcOThSSiZfE", - "type": "metric", - "unit": "tokens", - "value": 97 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "pzQMKAJc", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:56.809911+00:00", - "__module__": "datetime" - }, - "trace_id": "018KkGcOThSSiZfE", - "type": "metric", - "unit": "tokens", - "value": 39 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "pzQMKAJc", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:56.809922+00:00", - "__module__": "datetime" - }, - "trace_id": "018KkGcOThSSiZfE", - "type": "metric", - "unit": "tokens", - "value": 136 - } - ] + "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", + "text": "]\n else:\n return boiling_points[liquid_name] *", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -41712,7 +41482,7 @@ "data": { "event": { "delta": { - "text": "[", + "text": " 9/5 + 32\n else:\n", "type": "text" }, "event_type": { @@ -41732,7 +41502,7 @@ "data": { "event": { "delta": { - "text": "get_boiling_point(liquid_name='polyjuice", + "text": " return \"Boiling point not found\"\n```\n\nThen you", "type": "text" }, "event_type": { @@ -41752,7 +41522,7 @@ "data": { "event": { "delta": { - "text": "', celcius=True)]", + "text": " can call the function:\n\n```python\nprint(get_bo", "type": "text" }, "event_type": { @@ -41772,20 +41542,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "celcius": true, - "liquid_name": "polyjuice" - }, - "call_id": "328cb19d-47bb-47cc-8258-a5ca2e26803e", - "tool_name": "get_boiling_point" - }, - "type": "tool_call" + "text": "iling_point_with_metadata(\"polyjuice\"))\n```", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -41793,11 +41551,7 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } @@ -41825,55 +41579,19 @@ }, "metrics": [ { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "prompt_tokens", - "span_id": "dS0bhfN_", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:04:53.324788+00:00", - "__module__": "datetime" - }, - "trace_id": "UJz5Cas1SDyQYeBk", - "type": "metric", - "unit": "tokens", - "value": 37 + "unit": null, + "value": 86 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "completion_tokens", - "span_id": "dS0bhfN_", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:04:53.324835+00:00", - "__module__": "datetime" - }, - "trace_id": "UJz5Cas1SDyQYeBk", - "type": "metric", - "unit": "tokens", - "value": 28 + "unit": null, + "value": 250 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "total_tokens", - "span_id": "dS0bhfN_", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:04:53.324844+00:00", - "__module__": "datetime" - }, - "trace_id": "UJz5Cas1SDyQYeBk", - "type": "metric", - "unit": "tokens", - "value": 65 + "unit": null, + "value": 336 } ] } @@ -41881,7 +41599,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point_with_metadata\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point_with_metadata` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point_with_metadata\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -41909,7 +41627,7 @@ "data": { "event": { "delta": { - "text": "[", + "text": "The", "type": "text" }, "event_type": { @@ -41929,7 +41647,7 @@ "data": { "event": { "delta": { - "text": "get_boiling_point_with_metadata", + "text": " function get_boiling_point_with_metadata(", "type": "text" }, "event_type": { @@ -41949,7 +41667,7 @@ "data": { "event": { "delta": { - "text": "(liquid_name='polyjuice', cel", + "text": "liquid_name=\"polyjuice\", celcius=True) should be", "type": "text" }, "event_type": { @@ -41969,7 +41687,7 @@ "data": { "event": { "delta": { - "text": "cius=True)]", + "text": " used to get the answer.", "type": "text" }, "event_type": { @@ -41983,42 +41701,6 @@ "metrics": null } }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "celcius": true, - "liquid_name": "polyjuice" - }, - "call_id": "5bb48d00-7d5c-49e2-bddf-e5fdc5f35485", - "tool_name": "get_boiling_point_with_metadata" - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": null - } - }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", @@ -42047,16 +41729,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "mfrFN7m2", + "span_id": "pzQMKAJc", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T02:05:02.136501+00:00", + "__datetime__": "2025-03-07T01:45:56.809816+00:00", "__module__": "datetime" }, - "trace_id": "T4eddr4-SMWPQwKA", + "trace_id": "018KkGcOThSSiZfE", "type": "metric", "unit": "tokens", - "value": 37 + "value": 97 }, { "attributes": { @@ -42064,16 +41746,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "mfrFN7m2", + "span_id": "pzQMKAJc", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T02:05:02.136529+00:00", + "__datetime__": "2025-03-07T01:45:56.809911+00:00", "__module__": "datetime" }, - "trace_id": "T4eddr4-SMWPQwKA", + "trace_id": "018KkGcOThSSiZfE", "type": "metric", "unit": "tokens", - "value": 30 + "value": 39 }, { "attributes": { @@ -42081,16 +41763,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "mfrFN7m2", + "span_id": "pzQMKAJc", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T02:05:02.136535+00:00", + "__datetime__": "2025-03-07T01:45:56.809922+00:00", "__module__": "datetime" }, - "trace_id": "T4eddr4-SMWPQwKA", + "trace_id": "018KkGcOThSSiZfE", "type": "metric", "unit": "tokens", - "value": 67 + "value": 136 } ] } @@ -42098,7 +41780,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Give me a sentence that contains the word: hello\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": []}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -42126,7 +41808,7 @@ "data": { "event": { "delta": { - "text": "When", + "text": "[", "type": "text" }, "event_type": { @@ -42146,7 +41828,7 @@ "data": { "event": { "delta": { - "text": " I answered the", + "text": "get_boiling_point(liquid_name='polyjuice', cel", "type": "text" }, "event_type": { @@ -42166,7 +41848,7 @@ "data": { "event": { "delta": { - "text": " phone, the friendly", + "text": "cius=True)]", "type": "text" }, "event_type": { @@ -42186,16 +41868,4792 @@ "data": { "event": { "delta": { - "text": " voice on the other end said \"hello\"", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "celcius": true, + "liquid_name": "polyjuice" + }, + "call_id": "2dece34e-68c3-43a9-b685-e229569135ab", + "tool_name": "get_boiling_point" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 37 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 28 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 65 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "[", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "get_boiling_point_with_metadata(liquid_name='poly", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "juice', celcius=True)]", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "celcius": true, + "liquid_name": "polyjuice" + }, + "call_id": "24730d85-818c-4719-8a28-903160341849", + "tool_name": "get_boiling_point_with_metadata" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 37 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 30 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 67 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Give me a sentence that contains the word: hello\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": []}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "When", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " I answered the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " phone, the friendly voice on the other end said", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " \"hello\" and asked how I was doing", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ".", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 30 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 34 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 64 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\\n# Sample of data\\nprint(\\\"Data sample from file:\\\")\\nprint(df.head())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\\n# Sample of data\\nprint(\\\"Data sample from file:\\\")\\nprint(df.head())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " am not able", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " to execute this task as", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " it exceeds the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " limitations of the functions I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " have been given.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "5If5go-q", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:48.070675+00:00", + "__module__": "datetime" + }, + "trace_id": "StUjhrTMQKKQSRvS", + "type": "metric", + "unit": "tokens", + "value": 433 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "5If5go-q", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:48.070742+00:00", + "__module__": "datetime" + }, + "trace_id": "StUjhrTMQKKQSRvS", + "type": "metric", + "unit": "tokens", + "value": 31 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "5If5go-q", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:48.070750+00:00", + "__module__": "datetime" + }, + "trace_id": "StUjhrTMQKKQSRvS", + "type": "metric", + "unit": "tokens", + "value": 464 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\\n# Sample of data\\nprint(\\\"Data sample from file:\\\")\\nprint(df.head())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " error message indicates that the file 'bwrap' was not", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " found. This is likely because the file path provided", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " is incorrect or the file does not exist in", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the current working directory.\n\nTo resolve this", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " issue, you can try the following:\n\n1. Check", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the file path: Ensure that the file path provided is", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " correct and the file exists in the specified location.\n2. Use the correct", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " file path: If the file is located in a different directory,", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " provide the correct file path.\n3. Check the file name: Ensure that", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the file name is correct and matches the one provided", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " in the code.\n4. Use the absolute", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " file path: Instead of", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " using a relative file path, try using the absolute file path to", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the file.\n\nIf you are", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " still encountering issues, please provide more details about", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the file and its location, and I'll be happy to assist you further", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ".", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 238 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 183 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 421 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\n# Load data\ndf = pd.read_csv(\"/var", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "/folders/rb/qvqvwgyj6yjd3", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "t4pwsy9t0rm0000gn/T/tmpcf_0q3u", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "1/q2GJtbK2inflation.csv\")\n# Rows\nprint(\"Number of rows", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " and columns in the data:\", df.shape)\n# Columns\nprint(\"Columns of the data are:\",", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " len(df.columns))\n# Column names\nprint(\"Columns of the data are:\", df.columns)\n#", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " Column dtypes\nprint(\"Datatype of the columns are:\", df.dtypes)\n#", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " Sample of data\nprint(\"Data sample from file:\")\nprint(df.head())", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\n# Load data\ndf = pd.read_csv(\"/var/folders/rb/qvqvwgyj6yjd3t4pwsy9t0rm0000gn/T/tmpcf_0q3u1/q2GJtbK2inflation.csv\")\n# Rows\nprint(\"Number of rows and columns in the data:\", df.shape)\n# Columns\nprint(\"Columns of the data are:\", len(df.columns))\n# Column names\nprint(\"Columns of the data are:\", df.columns)\n# Column dtypes\nprint(\"Datatype of the columns are:\", df.dtypes)\n# Sample of data\nprint(\"Data sample from file:\")\nprint(df.head())" + }, + "call_id": "e57e8c8f-b440-46c6-aa6e-3d0e743e2fd9", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 37 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 47 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics about the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"The error message indicates that the file \\\"\" does not exist. This could be due to a variety of reasons such as the file being deleted, the path being incorrect, or the file not being accessible.\\n\\nTo resolve this issue, you can try the following:\\n\\n1. Check the file path: Ensure that the file path is correct and the file exists at that location.\\n2. Check file permissions: Ensure that the file is accessible and you have the necessary permissions to read it.\\n3. Try a different file: If the file is not accessible, try loading a different file to see if the issue is specific to this file or a general issue with your code.\\n4. Check for typos: Ensure that there are no typos in the file path or the code.\\n\\nIf you are still having issues, please provide more details about the file and the code you are using, and I'll be happy to help further.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Convert the 'Year' column to datetime\\ndf['Year'] = pd.to_datetime(df['Year'], format='%Y')\\n\\n# Group by 'Year' and calculate the average inflation\\ndf_avg_inflation = df.groupby('Year')['Inflation'].mean().reset_index()\\n\\n# Plot the average yearly inflation as a time series\\nplt.figure(figsize=(10,6))\\nplt.plot(df_avg_inflation['Year'], df_avg_inflation['Inflation'], marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Inflation')\\nplt.grid(True)\\nplt.show()\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " error message indicates that the file \"/var/folders/rb/qv8", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "vwgyj6yjd3t4pwsy9t0", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "rm0000gn/T/tmp2x_sml66/9vY", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "vmVRoinflation.csv\" does not exist. This could be due to", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " a variety of reasons such as the file being deleted, the path being incorrect", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ", or the file not being accessible.\n\nTo resolve this issue, you can", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " try the following:\n\n1. Check the file path: Ensure that the file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " path is correct and the file exists at that location.\n2. Check file permissions:", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " Ensure that the file is accessible and you have the necessary permissions to", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " read it.\n3. Try a different file: If the file is not", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " accessible, try loading a different file to see if the issue is specific to", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " this file or a general issue with your code.\n4. Check for ty", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "pos: Ensure that there are no typos in the file path or the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " code.\n\nIf you are still having issues, please provide more details about the file and the code", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " you are using, and I'll be happy to help further.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "f28sT2i7", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:23.262530+00:00", + "__module__": "datetime" + }, + "trace_id": "8YKzpfybSiGgrHOF", + "type": "metric", + "unit": "tokens", + "value": 680 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "f28sT2i7", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:23.262555+00:00", + "__module__": "datetime" + }, + "trace_id": "8YKzpfybSiGgrHOF", + "type": "metric", + "unit": "tokens", + "value": 238 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "f28sT2i7", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:23.262558+00:00", + "__module__": "datetime" + }, + "trace_id": "8YKzpfybSiGgrHOF", + "type": "metric", + "unit": "tokens", + "value": 918 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics about the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"The error message indicates that the file \\\"\" does not exist. This could be due to a variety of reasons such as the file being deleted, the path being incorrect, or the file not being accessible.\\n\\nTo resolve this issue, you can try the following:\\n\\n1. Check the file path: Ensure that the file path is correct and the file exists at that location.\\n2. Check file permissions: Ensure that the file is accessible and you have the necessary permissions to read it.\\n3. Try a different file: If the file is not accessible, try loading a different file to see if the issue is specific to this file or a general issue with your code.\\n4. Check for typos: Ensure that there are no typos in the file path or the code.\\n\\nIf you are still having issues, please provide more details about the file and the code you are using, and I'll be happy to help further.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load the CSV", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " file\ndf = pd.read_csv(\"/var/folders/rb/qv", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "8vwgyj6yjd3t4pwsy9t", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "0rm0000gn/T/tmp2x_sml66/9v", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "YvmVRoinflation.csv\")\n\n# Convert the 'Year'", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " column to datetime\ndf['Year'] = pd.to_datetime(df['Year", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "'], format='%Y')\n\n# Group by", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " 'Year' and calculate the average inflation\ndf_avg_in", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "flation = df.groupby('Year')['Inflation'].mean().reset_index()\n\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "# Plot the average yearly inflation as a time series\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "plt.figure(figsize=(10,6))\nplt.plot(df_avg_inflation['", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "Year'], df_avg_in", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "flation['Inflation'], marker='o')\nplt", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "('Inflation')\nplt.grid(True)\nplt.show()", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4pwsy9t0rm0000gn/T/tmp2x_sml66/9vYvmVRoinflation.csv\")\n\n# Convert the 'Year' column to datetime\ndf['Year'] = pd.to_datetime(df['Year'], format='%Y')\n\n# Group by 'Year' and calculate the average inflation\ndf_avg_inflation = df.groupby('Year')['Inflation'].mean().reset_index()\n\n# Plot the average yearly inflation as a time series\nplt.figure(figsize=(10,6))\nplt.plot(df_avg_inflation['Year'], df_avg_inflation['Inflation'], marker='o')\nplt.title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel('Inflation')\nplt.grid(True)\nplt.show()" + }, + "call_id": "f4efa2d4-e4e7-4ea1-8c5e-6a78bec5816f", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "qQY5sAli", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:21.953806+00:00", + "__module__": "datetime" + }, + "trace_id": "8YKzpfybSiGgrHOF", + "type": "metric", + "unit": "tokens", + "value": 432 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "qQY5sAli", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:21.953843+00:00", + "__module__": "datetime" + }, + "trace_id": "8YKzpfybSiGgrHOF", + "type": "metric", + "unit": "tokens", + "value": 10 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "qQY5sAli", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:21.953847+00:00", + "__module__": "datetime" + }, + "trace_id": "8YKzpfybSiGgrHOF", + "type": "metric", + "unit": "tokens", + "value": 442 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics of the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"The error message indicates that the file \\\"\" does not exist. This could be due to a number of reasons such as the file being deleted, the path being incorrect, or the file not being accessible.\\n\\nTo resolve this issue, you can try the following:\\n\\n1. Check the file path: Make sure the file path is correct and the file exists at that location.\\n2. Check file permissions: Ensure that the file is accessible and you have the necessary permissions to read it.\\n3. Try a different file: If the file is not available, try using a different file or creating a sample file to test the code.\\n\\nIf you are still having issues, please provide more details about the file and the environment you are running the code in, and I'll be happy to help further.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Convert the 'Year' column to datetime\\ndf['Year'] = pd.to_datetime(df['Year'], format='%Y')\\n\\n# Group by 'Year' and calculate the average inflation\\ndf_avg_inflation = df.groupby('Year')['Inflation'].mean().reset_index()\\n\\n# Plot the average yearly inflation as a time series\\nplt.figure(figsize=(10,6))\\nplt.plot(df_avg_inflation['Year'], df_avg_inflation['Inflation'], marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Inflation')\\nplt.grid(True)\\nplt.show()\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " error message indicates that the file \"/var/folders/rb/qv8", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "vwgyj6yjd3t4pwsy9t0", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "rm0000gn/T/tmpk42nx919/quNW0bvlin", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "flation.csv\" does not exist. This could be due to a number of", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " reasons such as the file being deleted, the path being incorrect, or the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " file not being accessible.\n\nTo resolve this issue,", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " you can try the following:\n\n1. Check the file path: Make sure", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the file path is correct and the file exists at that location.\n2.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " Check file permissions: Ensure that the file is accessible and you have the necessary", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " permissions to read it.\n3. Try a different file: If the file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " is not available, try using a different file or creating a sample file to", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " test the code.\n\nIf you are still having issues, please provide more details", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " about the file and the environment you are running the code in, and I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "'ll be happy to help further.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 644 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 208 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 852 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics of the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"The error message indicates that the file \\\"\" does not exist. This could be due to a number of reasons such as the file being deleted, the path being incorrect, or the file not being accessible.\\n\\nTo resolve this issue, you can try the following:\\n\\n1. Check the file path: Make sure the file path is correct and the file exists at that location.\\n2. Check file permissions: Ensure that the file is accessible and you have the necessary permissions to read it.\\n3. Try a different file: If the file is not available, try using a different file or creating a sample file to test the code.\\n\\nIf you are still having issues, please provide more details about the file and the environment you are running the code in, and I'll be happy to help further.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load the CSV", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " file\ndf = pd.read_csv(\"/var/folders/rb/qv", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "8vwgyj6yjd3t4pwsy9t", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "0rm0000gn/T/tmpk42nx919/quNW0bv", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "linflation.csv\")\n\n# Convert the 'Year' column to datetime\ndf", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "['Year'] = pd.to_datetime(df['Year'], format", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "='%Y')\n\n# Group by 'Year' and calculate", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " the average inflation\ndf_avg_inflation = df.groupby('", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "Year')['Inflation'].mean().reset_index()\n\n# Plot the average yearly", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " inflation as a time series\nplt.figure(figsize=(10", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ",6))\nplt.plot(df_avg_inflation['Year'], df_avg_in", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "flation['Inflation'], marker='o')\nplt.title('Average Yearly", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " Inflation')\nplt.xlabel('Year')\nplt.ylabel('Inflation')\nplt", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".grid(True)\nplt.show()", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4pwsy9t0rm0000gn/T/tmpk42nx919/quNW0bvlinflation.csv\")\n\n# Convert the 'Year' column to datetime\ndf['Year'] = pd.to_datetime(df['Year'], format='%Y')\n\n# Group by 'Year' and calculate the average inflation\ndf_avg_inflation = df.groupby('Year')['Inflation'].mean().reset_index()\n\n# Plot the average yearly inflation as a time series\nplt.figure(figsize=(10,6))\nplt.plot(df_avg_inflation['Year'], df_avg_inflation['Inflation'], marker='o')\nplt.title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel('Inflation')\nplt.grid(True)\nplt.show()" + }, + "call_id": "dcc87d01-280c-4aef-9d8a-4295ddaff6c6", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 399 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 409 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics of the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"The error message indicates that the file \\\"\" does not exist. This could be due to a number of reasons, such as the file being deleted or moved, or the path being incorrect.\\n\\nTo resolve this issue, you can try the following:\\n\\n1. Check the file path: Make sure the file path is correct and the file exists at that location.\\n2. Check file permissions: Ensure that the user running the script has read permissions for the file.\\n3. Use a try-except block: You can use a try-except block to catch the FileNotFoundError and handle it accordingly.\\n\\nHere's an example of how you can modify the code to handle the error:\\n\\n```\\nimport pandas as pd\\n\\ntry:\\n df = pd.read_csv(\\\"\")\\n print(df.head())\\n print(df.info())\\n print(df.describe())\\nexcept FileNotFoundError:\\n print(\\\"The file does not exist\\\")\\n```\\n\\nThis code will print \\\"The file does not exist\\\" if the file is not found, instead of raising an error.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Convert the 'Year' column to datetime\\ndf['Year'] = pd.to_datetime(df['Year'], format='%Y')\\n\\n# Group by 'Year' and calculate the average inflation\\ndf_avg_inflation = df.groupby('Year')['Inflation'].mean().reset_index()\\n\\n# Plot the average yearly inflation as a time series\\nplt.figure(figsize=(10,6))\\nplt.plot(df_avg_inflation['Year'], df_avg_inflation['Inflation'], marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Inflation')\\nplt.grid(True)\\nplt.show()\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " error message indicates that the file \"/var/folders/rb/qv8", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "vwgyj6yjd3t4pwsy9t0", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "rm0000gn/T/tmpaxhbg_k1/YkD5", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "cgQZinflation.csv\" does not exist. This could be due", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " to a number of reasons, such as the file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " being deleted or moved, or the path being incorrect.\n\nTo resolve this issue", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ", you can try the following:\n\n1. Check the file path: Make", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " sure the file path is correct and the file exists at that location.\n2", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ". Check file permissions: Ensure that the user running the script has read permissions", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " for the file.\n3", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ". Use a try-except block: You can", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " use a try-except block to catch the FileNotFoundError", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " and handle it accordingly.\n\nHere's an example of how you can modify the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " code to handle the error:\n\n```\nimport pandas as pd\nimport matplotlib", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ".pyplot as plt\n\ntry:\n df = pd.read_csv(\"/var", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "/folders/rb/qv8vwgyj6yjd3t", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "4pwsy9t0rm0000", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "gn/T/tmpaxhbg_k1/YkD5cgQZ", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "inflation.csv\")\n df['Year'] = pd.to_datetime(df['", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "Year'], format='%Y')\n df_avg_inflation = df.groupby('", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "Year')['Inflation'].mean().reset_index()\n plt.figure(figsize=(", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "10,6))\n plt.plot(df_avg_inflation['Year'], df", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "_avg_inflation['Inflation'], marker='o')\n plt.title('", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "Average Yearly Inflation')\n plt.xlabel('Year')\n plt.ylabel", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "('Inflation')\n plt.grid(True)\n plt.show()\nexcept FileNotFoundError", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ":\n print(\"The file does not exist\")\n```\n\n", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "This code will print \"The file does not exist\" if the file is", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " not found, instead of raising an error.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 745 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 391 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 1136 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics of the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"The error message indicates that the file \\\"\" does not exist. This could be due to a number of reasons, such as the file being deleted or moved, or the path being incorrect.\\n\\nTo resolve this issue, you can try the following:\\n\\n1. Check the file path: Make sure the file path is correct and the file exists at that location.\\n2. Check file permissions: Ensure that the user running the script has read permissions for the file.\\n3. Use a try-except block: You can use a try-except block to catch the FileNotFoundError and handle it accordingly.\\n\\nHere's an example of how you can modify the code to handle the error:\\n\\n```\\nimport pandas as pd\\n\\ntry:\\n df = pd.read_csv(\\\"\")\\n print(df.head())\\n print(df.info())\\n print(df.describe())\\nexcept FileNotFoundError:\\n print(\\\"The file does not exist\\\")\\n```\\n\\nThis code will print \\\"The file does not exist\\\" if the file is not found, instead of raising an error.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load the CSV", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " file\ndf = pd.read_csv(\"/var/folders/rb/qv", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "8vwgyj6yjd3t4", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "pwsy9t0rm0000gn", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "/T/tmpaxhbg_k1/YkD5cgQZin", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "flation.csv\")\n\n# Convert the 'Year' column", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " to datetime\ndf['Year'] = pd.to_datetime(df['Year'],", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " format='%Y')\n\n# Group by", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " 'Year' and calculate the average inflation\ndf_avg_in", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "flation = df.groupby('Year')['Inflation'].mean().reset_index()\n\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "# Plot the average yearly inflation as a time series\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "plt.figure(figsize=(10,6))\nplt.plot(df_avg_inflation['", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "Year'], df_avg_inflation['Inflation'], marker='o')\nplt", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "('Inflation')\nplt.grid(True)\nplt.show()", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4pwsy9t0rm0000gn/T/tmpaxhbg_k1/YkD5cgQZinflation.csv\")\n\n# Convert the 'Year' column to datetime\ndf['Year'] = pd.to_datetime(df['Year'], format='%Y')\n\n# Group by 'Year' and calculate the average inflation\ndf_avg_inflation = df.groupby('Year')['Inflation'].mean().reset_index()\n\n# Plot the average yearly inflation as a time series\nplt.figure(figsize=(10,6))\nplt.plot(df_avg_inflation['Year'], df_avg_inflation['Inflation'], marker='o')\nplt.title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel('Inflation')\nplt.grid(True)\nplt.show()" + }, + "call_id": "947831c7-8279-4f5b-8cfd-c30697382127", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 496 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 506 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics about the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " error message indicates that the file \"/var/folders/rb/qv8vwgyj6y", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "jd3t4pwsy9t0rm0000gn/T/tmp2x_sml", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "66/9vYvmVRoinflation.csv\" does not exist. This could be", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " due to a variety of", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " reasons such as the file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " being deleted, the path being incorrect, or the file", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " not being accessible.\n\nTo resolve this issue, you can try", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the following:\n\n1. Check the file path: Ensure that", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the file path is correct and the file exists at that", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " location.\n2. Check file permissions: Ensure that", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the file is accessible and", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " you have the necessary permissions to read", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " it.\n3. Try a different file: If", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the file is not accessible, try loading a different file to see", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " if the issue is specific to this file or a general", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " issue with your code.\n", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "4. Check for typos: Ensure that", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " there are no typos in the file path or the code.\n\n", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "If you are", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " still having issues, please provide more details about", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the file and the code you are using", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null }, "metrics": null } @@ -42206,7 +46664,7 @@ "data": { "event": { "delta": { - "text": " and asked how I was doing.", + "text": ", and I'll be happy to help further.", "type": "text" }, "event_type": { @@ -42248,16 +46706,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "tJEuRhla", + "span_id": "KwfNrQLy", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:01.044284+00:00", + "__datetime__": "2025-03-07T01:44:19.630894+00:00", "__module__": "datetime" }, - "trace_id": "bnDS7Z41TRO0UyfH", + "trace_id": "kNsljyzfQV2Cn4aZ", "type": "metric", "unit": "tokens", - "value": 30 + "value": 192 }, { "attributes": { @@ -42265,55 +46723,115 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "tJEuRhla", + "span_id": "KwfNrQLy", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:19.630987+00:00", + "__module__": "datetime" + }, + "trace_id": "kNsljyzfQV2Cn4aZ", + "type": "metric", + "unit": "tokens", + "value": 238 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "KwfNrQLy", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:01.044312+00:00", + "__datetime__": "2025-03-07T01:44:19.630996+00:00", "__module__": "datetime" }, - "trace_id": "bnDS7Z41TRO0UyfH", + "trace_id": "kNsljyzfQV2Cn4aZ", "type": "metric", "unit": "tokens", - "value": 34 + "value": 430 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics of the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " error message indicates that the file \"/var/folders/rb/qv8", + "type": "text" }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "tJEuRhla", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:01.044318+00:00", - "__module__": "datetime" - }, - "trace_id": "bnDS7Z41TRO0UyfH", - "type": "metric", - "unit": "tokens", - "value": 64 - } - ] + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\\n# Sample of data\\nprint(\\\"Data sample from file:\\\")\\nprint(df.head())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\\n# Sample of data\\nprint(\\\"Data sample from file:\\\")\\nprint(df.head())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", + "text": "vwgyj6yjd3t4pwsy9t0", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -42327,7 +46845,7 @@ "data": { "event": { "delta": { - "text": "I", + "text": "rm0000gn/T/tmpaxhbg_k1/YkD5", "type": "text" }, "event_type": { @@ -42347,7 +46865,7 @@ "data": { "event": { "delta": { - "text": " am not able", + "text": "cgQZinflation.csv\" does not exist. This could be", "type": "text" }, "event_type": { @@ -42367,7 +46885,7 @@ "data": { "event": { "delta": { - "text": " to execute this task as", + "text": " due to a number of reasons, such as the file being deleted or moved", "type": "text" }, "event_type": { @@ -42387,7 +46905,7 @@ "data": { "event": { "delta": { - "text": " it exceeds the", + "text": ", or the path being incorrect.\n\nTo resolve this", "type": "text" }, "event_type": { @@ -42407,7 +46925,7 @@ "data": { "event": { "delta": { - "text": " limitations of the functions I", + "text": " issue, you can try the following:\n\n1. Check the file path:", "type": "text" }, "event_type": { @@ -42427,7 +46945,7 @@ "data": { "event": { "delta": { - "text": " have been given.", + "text": " Make sure the file path is correct and the file exists at that", "type": "text" }, "event_type": { @@ -42447,94 +46965,53 @@ "data": { "event": { "delta": { - "text": "", + "text": " location.\n2. Check file permissions", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "5If5go-q", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:48.070675+00:00", - "__module__": "datetime" - }, - "trace_id": "StUjhrTMQKKQSRvS", - "type": "metric", - "unit": "tokens", - "value": 433 + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ": Ensure that the user running the script has read permissions", + "type": "text" }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "5If5go-q", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:48.070742+00:00", - "__module__": "datetime" - }, - "trace_id": "StUjhrTMQKKQSRvS", - "type": "metric", - "unit": "tokens", - "value": 31 + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "5If5go-q", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:48.070750+00:00", - "__module__": "datetime" - }, - "trace_id": "StUjhrTMQKKQSRvS", - "type": "metric", - "unit": "tokens", - "value": 464 - } - ] + "logprobs": null, + "stop_reason": null + }, + "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\\n# Sample of data\\nprint(\\\"Data sample from file:\\\")\\nprint(df.head())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", + "text": " for the file.\n3. Use a", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -42548,13 +47025,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" + "text": " try-except block: You can use a try-except", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -42573,13 +47045,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "import pandas as pd\n# Load data\ndf =", - "type": "tool_call" + "text": " block to catch the FileNotFoundError and handle it accordingly.\n\nHere's an", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -42598,13 +47065,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " pd.read_csv(\"/var/folders/rb/qv8vwgyj", - "type": "tool_call" + "text": " example of how you can modify the code to handle the error", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -42623,13 +47085,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "6yjd3t4pwsy9t0rm0000", - "type": "tool_call" + "text": ":\n\n```\nimport pandas as pd\n\ntry:\n", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -42648,13 +47105,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "gn/T/tmp2x_sml66/ZEjbinQHin", - "type": "tool_call" + "text": " df = pd.read_csv(\"/var/folders/rb/q", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -42673,13 +47125,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "flation.csv\")\n# Rows\nprint(\"Number of rows and columns in the", - "type": "tool_call" + "text": "v8vwgyj6yjd3t4pwsy9", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -42698,13 +47145,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " data:\", df.shape)\n# Columns\nprint(\"Columns of the data are:\",", - "type": "tool_call" + "text": "t0rm0000gn/T/tmpaxhbg_k1/Yk", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -42723,13 +47165,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " len(df.columns))\n# Column names\nprint(\"Columns of the data", - "type": "tool_call" + "text": "D5cgQZinflation.csv\")\n print(df.head())\n print(df.info())\n ", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -42748,13 +47185,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " are:\", df.columns)\n# Column dtypes\nprint(\"Datatype of the columns are:\", df", - "type": "tool_call" + "text": " print(df.describe())\nexcept FileNotFoundError:\n print(\"The file does not exist", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -42773,13 +47205,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": ".dtypes)\n# Sample of data\nprint(\"Data sample from file:\")\n", - "type": "tool_call" + "text": "\")\n```\n\nThis code will print \"", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -42798,13 +47225,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "print(df.head())", - "type": "tool_call" + "text": "The file does not exist\" if the file is not", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -42823,23 +47245,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "code": "import pandas as pd\n# Load data\ndf = pd.read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4pwsy9t0rm0000gn/T/tmp2x_sml66/ZEjbinQHinflation.csv\")\n# Rows\nprint(\"Number of rows and columns in the data:\", df.shape)\n# Columns\nprint(\"Columns of the data are:\", len(df.columns))\n# Column names\nprint(\"Columns of the data are:\", df.columns)\n# Column dtypes\nprint(\"Datatype of the columns are:\", df.dtypes)\n# Sample of data\nprint(\"Data sample from file:\")\nprint(df.head())" - }, - "call_id": "1df8b196-9eff-4b06-97e7-ab175c741e8f", - "tool_name": { - "__enum__": "BuiltinTool", - "__module__": "llama_stack.models.llama.datatypes", - "value": "code_interpreter" - } - }, - "type": "tool_call" + "text": " found, instead of raising an error.", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -42847,11 +47254,7 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } @@ -42879,55 +47282,19 @@ }, "metrics": [ { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "prompt_tokens", - "span_id": "fLqIbpek", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:40.262304+00:00", - "__module__": "datetime" - }, - "trace_id": "StUjhrTMQKKQSRvS", - "type": "metric", - "unit": "tokens", - "value": 235 + "unit": null, + "value": 193 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "completion_tokens", - "span_id": "fLqIbpek", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:40.262340+00:00", - "__module__": "datetime" - }, - "trace_id": "StUjhrTMQKKQSRvS", - "type": "metric", - "unit": "tokens", - "value": 10 + "unit": null, + "value": 301 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "total_tokens", - "span_id": "fLqIbpek", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:40.262347+00:00", - "__module__": "datetime" - }, - "trace_id": "StUjhrTMQKKQSRvS", - "type": "metric", - "unit": "tokens", - "value": 245 + "unit": null, + "value": 494 } ] } @@ -42935,7 +47302,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -42993,7 +47360,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "import pandas as pd\n# Load data\ndf = pd", + "tool_call": "import pandas as pd\n\n# Load the CSV file\ndf = pd.read", "type": "tool_call" }, "event_type": { @@ -43018,7 +47385,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": ".read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4", + "tool_call": "_csv(\"/var/folders/rb/qv8vwgyj6y", "type": "tool_call" }, "event_type": { @@ -43043,7 +47410,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "pwsy9t0rm0000gn/T/tmp2x_sml66/ZEj", + "tool_call": "jd3t4pwsy9t0rm0000gn/T", "type": "tool_call" }, "event_type": { @@ -43068,7 +47435,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "binQHinflation.csv\")\n# Rows\nprint(\"Number of rows and columns in the data", + "tool_call": "/tmpaxhbg_k1/YkD5cgQZinflation", "type": "tool_call" }, "event_type": { @@ -43093,7 +47460,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": ":\", df.shape)\n# Columns\nprint(\"Columns of the data are:\", len(df.columns))\n#", + "tool_call": ".csv\")\n\n# Print the first few rows of the dataframe\nprint(df.head", "type": "tool_call" }, "event_type": { @@ -43118,7 +47485,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " Column names\nprint(\"Columns of the data are:\", df.columns)\n# Column dtypes\nprint", + "tool_call": "())\n\n# Print information about", "type": "tool_call" }, "event_type": { @@ -43143,7 +47510,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "(\"Datatype of the columns are:\", df.dtypes)\n# Sample", + "tool_call": " the dataframe\nprint(df", "type": "tool_call" }, "event_type": { @@ -43168,7 +47535,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " of data\nprint(\"Data sample from file:\")\nprint(df.head())", + "tool_call": ".info())\n\n# Print summary statistics of the dataframe\nprint(df.describe())", "type": "tool_call" }, "event_type": { @@ -43195,9 +47562,9 @@ }, "tool_call": { "arguments": { - "code": "import pandas as pd\n# Load data\ndf = pd.read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4pwsy9t0rm0000gn/T/tmp2x_sml66/ZEjbinQHinflation.csv\")\n# Rows\nprint(\"Number of rows and columns in the data:\", df.shape)\n# Columns\nprint(\"Columns of the data are:\", len(df.columns))\n# Column names\nprint(\"Columns of the data are:\", df.columns)\n# Column dtypes\nprint(\"Datatype of the columns are:\", df.dtypes)\n# Sample of data\nprint(\"Data sample from file:\")\nprint(df.head())" + "code": "import pandas as pd\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4pwsy9t0rm0000gn/T/tmpaxhbg_k1/YkD5cgQZinflation.csv\")\n\n# Print the first few rows of the dataframe\nprint(df.head())\n\n# Print information about the dataframe\nprint(df.info())\n\n# Print summary statistics of the dataframe\nprint(df.describe())" }, - "call_id": "c1708ded-f272-4008-b91f-19d61780c394", + "call_id": "ae5e788f-1422-4f12-b465-a29043709fbb", "tool_name": { "__enum__": "BuiltinTool", "__module__": "llama_stack.models.llama.datatypes", @@ -43244,55 +47611,19 @@ }, "metrics": [ { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "prompt_tokens", - "span_id": "KTMayjIE", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:37.305765+00:00", - "__module__": "datetime" - }, - "trace_id": "StUjhrTMQKKQSRvS", - "type": "metric", - "unit": "tokens", - "value": 37 + "unit": null, + "value": 36 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "completion_tokens", - "span_id": "KTMayjIE", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:37.305820+00:00", - "__module__": "datetime" - }, - "trace_id": "StUjhrTMQKKQSRvS", - "type": "metric", - "unit": "tokens", + "unit": null, "value": 10 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "total_tokens", - "span_id": "KTMayjIE", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:37.305832+00:00", - "__module__": "datetime" - }, - "trace_id": "StUjhrTMQKKQSRvS", - "type": "metric", - "unit": "tokens", - "value": 47 + "unit": null, + "value": 46 } ] } @@ -43300,7 +47631,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics about the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"The error message indicates that the file \\\"\" does not exist. This could be due to a variety of reasons such as the file being deleted, the path being incorrect, or the file not being accessible.\\n\\nTo resolve this issue, you can try the following:\\n\\n1. Check the file path: Ensure that the file path is correct and the file exists at that location.\\n2. Check file permissions: Ensure that the file is accessible and you have the necessary permissions to read it.\\n3. Try a different file: If the file is not accessible, try loading a different file to see if the issue is specific to this file or a general issue with your code.\\n4. Check for typos: Ensure that there are no typos in the file path or the code.\\n\\nIf you are still having issues, please provide more details about the file and the code you are using, and I'll be happy to help further.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Convert the 'Year' column to datetime\\ndf['Year'] = pd.to_datetime(df['Year'], format='%Y')\\n\\n# Group by 'Year' and calculate the average inflation\\ndf_avg_inflation = df.groupby('Year')['Inflation'].mean().reset_index()\\n\\n# Plot the average yearly inflation as a time series\\nplt.figure(figsize=(10,6))\\nplt.plot(df_avg_inflation['Year'], df_avg_inflation['Inflation'], marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Inflation')\\nplt.grid(True)\\nplt.show()\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:3e31d\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:14c27\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:90a49\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"using LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:14c27\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:14c27\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:14c27\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics about the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"The error message indicates that the file \\\"\" does not exist. This could be due to a variety of reasons such as the file being deleted, the path being incorrect, or the file not being accessible.\\n\\nTo resolve this issue, you can try the following:\\n\\n1. Check the file path: Ensure that the file path is correct and the file exists at that location.\\n2. Check file permissions: Ensure that the file is accessible and you have the necessary permissions to read it.\\n3. Try a different file: If the file is not accessible, try loading a different file to see if the issue is specific to this file or a general issue with your code.\\n4. Check for typos: Ensure that there are no typos in the file path or the code.\\n\\nIf you are still having issues, please provide more details about the file and the code you are using, and I'll be happy to help further.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", + "text": " also experiment with different LoRA configurations, such as", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -43749,13 +48039,88 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" + "text": " applying Lo", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "RA to all linear layers in the self-attention, increasing", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the rank, and scaling", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " alpha and rank together.\n\nNote:", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " You need to have the", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -43774,13 +48139,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load the CSV", - "type": "tool_call" + "text": " pre-trained Llama2 weights and tokenizer downloaded and installed before", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -43799,13 +48159,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " file\ndf = pd.read_csv(\"/var/folders/rb/qv", - "type": "tool_call" + "text": " running the LoRA finetune. Additionally, you can use", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -43824,13 +48179,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "8vwgyj6yjd3t4pwsy9t", - "type": "tool_call" + "text": " torcht", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -43849,13 +48199,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "0rm0000gn/T/tmp2x_sml66/9v", - "type": "tool_call" + "text": "une's `WandBLogger` to generate loss curves and track your", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -43874,13 +48219,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "YvmVRoinflation.csv\")\n\n# Convert the 'Year'", - "type": "tool_call" + "text": " experiments.", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -43899,43 +48239,58 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " column to datetime\ndf['Year'] = pd.to_datetime(df['Year", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 146 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 296 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 442 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:3e31d\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:14c27\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:90a49\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "'], format='%Y')\n\n# Group by", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -43949,13 +48304,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " 'Year' and calculate the average inflation\ndf_avg_in", - "type": "tool_call" + "text": "[k", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -43974,13 +48324,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "flation = df.groupby('Year')['Inflation'].mean().reset_index()\n\n", - "type": "tool_call" + "text": "nowledge_search(query=\"using LoRA in Torchtune", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -43999,13 +48344,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "# Plot the average yearly inflation as a time series\n", - "type": "tool_call" + "text": "\")]", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -44027,9 +48367,15 @@ "parse_status": { "__enum__": "ToolCallParseStatus", "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "using LoRA in Torchtune" + }, + "call_id": "0e2bf2f6-0915-4d44-ad3b-6fd48d3df717", + "tool_name": "knowledge_search" }, - "tool_call": "plt.figure(figsize=(10,6))\nplt.plot(df_avg_inflation['", "type": "tool_call" }, "event_type": { @@ -44038,7 +48384,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -44049,43 +48399,58 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "Year'], df_avg_in", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 107 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 23 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 130 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:3e31d\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:14c27\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:90a49\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "flation['Inflation'], marker='o')\nplt", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -44099,13 +48464,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": ".title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel", - "type": "tool_call" + "text": "I", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -44124,13 +48484,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "('Inflation')\nplt.grid(True)\nplt.show()", - "type": "tool_call" + "text": "'m ready to help. What's", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -44149,23 +48504,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4pwsy9t0rm0000gn/T/tmp2x_sml66/9vYvmVRoinflation.csv\")\n\n# Convert the 'Year' column to datetime\ndf['Year'] = pd.to_datetime(df['Year'], format='%Y')\n\n# Group by 'Year' and calculate the average inflation\ndf_avg_inflation = df.groupby('Year')['Inflation'].mean().reset_index()\n\n# Plot the average yearly inflation as a time series\nplt.figure(figsize=(10,6))\nplt.plot(df_avg_inflation['Year'], df_avg_inflation['Inflation'], marker='o')\nplt.title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel('Inflation')\nplt.grid(True)\nplt.show()" - }, - "call_id": "f4efa2d4-e4e7-4ea1-8c5e-6a78bec5816f", - "tool_name": { - "__enum__": "BuiltinTool", - "__module__": "llama_stack.models.llama.datatypes", - "value": "code_interpreter" - } - }, - "type": "tool_call" + "text": " your question about Torchtune?", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -44173,11 +48513,7 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } @@ -44198,62 +48534,26 @@ }, "logprobs": null, "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "qQY5sAli", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:21.953806+00:00", - "__module__": "datetime" - }, - "trace_id": "8YKzpfybSiGgrHOF", - "type": "metric", - "unit": "tokens", - "value": 432 + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 75 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "completion_tokens", - "span_id": "qQY5sAli", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:21.953843+00:00", - "__module__": "datetime" - }, - "trace_id": "8YKzpfybSiGgrHOF", - "type": "metric", - "unit": "tokens", - "value": 10 + "unit": null, + "value": 25 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "total_tokens", - "span_id": "qQY5sAli", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:21.953847+00:00", - "__module__": "datetime" - }, - "trace_id": "8YKzpfybSiGgrHOF", - "type": "metric", - "unit": "tokens", - "value": 442 + "unit": null, + "value": 100 } ] } @@ -44261,7 +48561,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics about the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:42933\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:20e5d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0cd43\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"using LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:20e5d\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:20e5d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:20e5d\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:42933\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:20e5d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0cd43\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -44850,13 +49170,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" + "text": "[k", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -44875,13 +49190,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "import pandas as pd\n\n# Load the CSV file\ndf = pd.read", - "type": "tool_call" + "text": "nowledge_search(query=\"using LoRA in Torchtune", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -44900,13 +49210,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "_csv(\"/var/folders/rb/qv8vwgyj6y", - "type": "tool_call" + "text": "\")]", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -44928,9 +49233,15 @@ "parse_status": { "__enum__": "ToolCallParseStatus", "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "using LoRA in Torchtune" + }, + "call_id": "ce4b06be-6e7f-45cf-9555-25398caaf4f1", + "tool_name": "knowledge_search" }, - "tool_call": "jd3t4pwsy9t0rm0000gn/T", "type": "tool_call" }, "event_type": { @@ -44939,7 +49250,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -44950,68 +49265,94 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "/tmp2x_sml66/9vYvmVRoinflation.csv", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" }, - "tool_call": "\")\n\n# Print the first few rows of the dataframe\nprint(df.head())\n\n", - "type": "tool_call" + "metric": "prompt_tokens", + "span_id": "vGtNmXNY", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:32.673350+00:00", + "__module__": "datetime" + }, + "trace_id": "8C2YTmRESTKZ0i1l", + "type": "metric", + "unit": "tokens", + "value": 107 }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "vGtNmXNY", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:32.673375+00:00", + "__module__": "datetime" + }, + "trace_id": "8C2YTmRESTKZ0i1l", + "type": "metric", + "unit": "tokens", + "value": 23 }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "vGtNmXNY", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:32.673381+00:00", + "__module__": "datetime" + }, + "trace_id": "8C2YTmRESTKZ0i1l", + "type": "metric", + "unit": "tokens", + "value": 130 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:42933\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:20e5d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0cd43\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "# Print information about the dataframe\nprint(df", - "type": "tool_call" + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -45025,13 +49366,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": ".info())\n\n# Print summary statistics about the dataframe\nprint(df.describe", - "type": "tool_call" + "text": "I", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -45050,13 +49386,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "())", - "type": "tool_call" + "text": "'m ready to help. What's", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -45075,23 +49406,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "code": "import pandas as pd\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4pwsy9t0rm0000gn/T/tmp2x_sml66/9vYvmVRoinflation.csv\")\n\n# Print the first few rows of the dataframe\nprint(df.head())\n\n# Print information about the dataframe\nprint(df.info())\n\n# Print summary statistics about the dataframe\nprint(df.describe())" - }, - "call_id": "5bbfebeb-4360-4ef9-a9e2-4227a8e8c699", - "tool_name": { - "__enum__": "BuiltinTool", - "__module__": "llama_stack.models.llama.datatypes", - "value": "code_interpreter" - } - }, - "type": "tool_call" + "text": " your question about Torchtune?", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -45099,11 +49415,7 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } @@ -45136,16 +49448,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "AyEX3So6", + "span_id": "7n3WMt3R", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:17.873486+00:00", + "__datetime__": "2025-03-07T01:45:31.179269+00:00", "__module__": "datetime" }, - "trace_id": "kNsljyzfQV2Cn4aZ", + "trace_id": "BLgI_VzNTCCRs_2T", "type": "metric", "unit": "tokens", - "value": 36 + "value": 75 }, { "attributes": { @@ -45153,16 +49465,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "AyEX3So6", + "span_id": "7n3WMt3R", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:17.873500+00:00", + "__datetime__": "2025-03-07T01:45:31.179301+00:00", "__module__": "datetime" }, - "trace_id": "kNsljyzfQV2Cn4aZ", + "trace_id": "BLgI_VzNTCCRs_2T", "type": "metric", "unit": "tokens", - "value": 10 + "value": 25 }, { "attributes": { @@ -45170,16 +49482,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "AyEX3So6", + "span_id": "7n3WMt3R", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:17.873503+00:00", + "__datetime__": "2025-03-07T01:45:31.179308+00:00", "__module__": "datetime" }, - "trace_id": "kNsljyzfQV2Cn4aZ", + "trace_id": "BLgI_VzNTCCRs_2T", "type": "metric", "unit": "tokens", - "value": 46 + "value": 100 } ] } @@ -45187,7 +49499,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:42933\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:20e5d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0cd43\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"using LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:20e5d\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:20e5d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:20e5d\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:57768\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:4fd22\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:6e83e\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"using LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:4fd22\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:4fd22\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:4fd22\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:42933\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:20e5d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0cd43\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:57768\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:4fd22\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:6e83e\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -45865,7 +50221,7 @@ "arguments": { "query": "using LoRA in Torchtune" }, - "call_id": "ce4b06be-6e7f-45cf-9555-25398caaf4f1", + "call_id": "b21a18ba-4bb8-4f64-8ae9-acb263ba0654", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -45908,54 +50264,18 @@ }, "metrics": [ { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "prompt_tokens", - "span_id": "vGtNmXNY", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:32.673350+00:00", - "__module__": "datetime" - }, - "trace_id": "8C2YTmRESTKZ0i1l", - "type": "metric", - "unit": "tokens", + "unit": null, "value": 107 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "vGtNmXNY", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:32.673375+00:00", - "__module__": "datetime" - }, - "trace_id": "8C2YTmRESTKZ0i1l", - "type": "metric", - "unit": "tokens", + "metric": "completion_tokens", + "unit": null, "value": 23 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "total_tokens", - "span_id": "vGtNmXNY", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:32.673381+00:00", - "__module__": "datetime" - }, - "trace_id": "8C2YTmRESTKZ0i1l", - "type": "metric", - "unit": "tokens", + "unit": null, "value": 130 } ] @@ -45964,7 +50284,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:42933\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:20e5d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0cd43\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:57768\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:4fd22\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:6e83e\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -46069,54 +50389,18 @@ }, "metrics": [ { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "prompt_tokens", - "span_id": "7n3WMt3R", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:31.179269+00:00", - "__module__": "datetime" - }, - "trace_id": "BLgI_VzNTCCRs_2T", - "type": "metric", - "unit": "tokens", + "unit": null, "value": 75 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "completion_tokens", - "span_id": "7n3WMt3R", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:31.179301+00:00", - "__module__": "datetime" - }, - "trace_id": "BLgI_VzNTCCRs_2T", - "type": "metric", - "unit": "tokens", + "unit": null, "value": 25 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "total_tokens", - "span_id": "7n3WMt3R", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:31.179308+00:00", - "__module__": "datetime" - }, - "trace_id": "BLgI_VzNTCCRs_2T", - "type": "metric", - "unit": "tokens", + "unit": null, "value": 100 } ] @@ -46732,209 +51016,13 @@ "span_id": "IZ8Q_jX_", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:28.484922+00:00", - "__module__": "datetime" - }, - "trace_id": "7GQeegpgTI-gqjHp", - "type": "metric", - "unit": "tokens", - "value": 437 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:8106c\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:a03f3\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0719d\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your first question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "[k", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "nowledge_search(query=\"using LoRA in Torchtune", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "\")]", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "using LoRA in Torchtune" - }, - "call_id": "d45a488f-368a-4a3b-a2d9-8fde584fc8f8", - "tool_name": "knowledge_search" - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "qLPBZlok", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:26.209198+00:00", - "__module__": "datetime" - }, - "trace_id": "7GQeegpgTI-gqjHp", - "type": "metric", - "unit": "tokens", - "value": 108 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "qLPBZlok", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:26.209239+00:00", - "__module__": "datetime" - }, - "trace_id": "7GQeegpgTI-gqjHp", - "type": "metric", - "unit": "tokens", - "value": 23 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "qLPBZlok", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:26.209247+00:00", + "__datetime__": "2025-03-07T01:45:28.484922+00:00", "__module__": "datetime" }, "trace_id": "7GQeegpgTI-gqjHp", "type": "metric", "unit": "tokens", - "value": 131 + "value": 437 } ] } @@ -46942,7 +51030,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:8106c\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:a03f3\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0719d\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:8106c\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:a03f3\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0719d\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your first question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -46970,7 +51058,7 @@ "data": { "event": { "delta": { - "text": "I", + "text": "[k", "type": "text" }, "event_type": { @@ -46990,7 +51078,7 @@ "data": { "event": { "delta": { - "text": "'m ready to help. What's", + "text": "nowledge_search(query=\"using LoRA in Torchtune", "type": "text" }, "event_type": { @@ -47010,7 +51098,7 @@ "data": { "event": { "delta": { - "text": " your first question about Torchtune", + "text": "\")]", "type": "text" }, "event_type": { @@ -47030,8 +51118,19 @@ "data": { "event": { "delta": { - "text": "?", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "using LoRA in Torchtune" + }, + "call_id": "d45a488f-368a-4a3b-a2d9-8fde584fc8f8", + "tool_name": "knowledge_search" + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -47039,7 +51138,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -47072,16 +51175,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "mYTkxvK_", + "span_id": "qLPBZlok", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:23.525734+00:00", + "__datetime__": "2025-03-07T01:45:26.209198+00:00", "__module__": "datetime" }, - "trace_id": "kpcdkZQ2SsSOh9Lw", + "trace_id": "7GQeegpgTI-gqjHp", "type": "metric", "unit": "tokens", - "value": 75 + "value": 108 }, { "attributes": { @@ -47089,16 +51192,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "mYTkxvK_", + "span_id": "qLPBZlok", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:23.525763+00:00", + "__datetime__": "2025-03-07T01:45:26.209239+00:00", "__module__": "datetime" }, - "trace_id": "kpcdkZQ2SsSOh9Lw", + "trace_id": "7GQeegpgTI-gqjHp", "type": "metric", "unit": "tokens", - "value": 26 + "value": 23 }, { "attributes": { @@ -47106,16 +51209,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "mYTkxvK_", + "span_id": "qLPBZlok", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:23.525770+00:00", + "__datetime__": "2025-03-07T01:45:26.209247+00:00", "__module__": "datetime" }, - "trace_id": "kpcdkZQ2SsSOh9Lw", + "trace_id": "7GQeegpgTI-gqjHp", "type": "metric", "unit": "tokens", - "value": 101 + "value": 131 } ] } @@ -47123,7 +51226,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:8106c\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:a03f3\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0719d\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -47151,7 +51254,7 @@ "data": { "event": { "delta": { - "text": "[k", + "text": "I", "type": "text" }, "event_type": { @@ -47171,7 +51274,7 @@ "data": { "event": { "delta": { - "text": "nowledge_search(query=\"Tor", + "text": "'m ready to help. What's", "type": "text" }, "event_type": { @@ -47191,7 +51294,7 @@ "data": { "event": { "delta": { - "text": "chtune documentation\")]", + "text": " your first question about Torchtune", "type": "text" }, "event_type": { @@ -47211,19 +51314,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "Torchtune documentation" - }, - "call_id": "385cbde8-19e8-4c8b-84ca-b75050b3666b", - "tool_name": "knowledge_search" - }, - "type": "tool_call" + "text": "?", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -47231,11 +51323,7 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } @@ -47268,16 +51356,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "-7YS2sLl", + "span_id": "mYTkxvK_", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:30.668846+00:00", + "__datetime__": "2025-03-07T01:45:23.525734+00:00", "__module__": "datetime" }, - "trace_id": "BLgI_VzNTCCRs_2T", + "trace_id": "kpcdkZQ2SsSOh9Lw", "type": "metric", "unit": "tokens", - "value": 39 + "value": 75 }, { "attributes": { @@ -47285,16 +51373,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "-7YS2sLl", + "span_id": "mYTkxvK_", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:30.668859+00:00", + "__datetime__": "2025-03-07T01:45:23.525763+00:00", "__module__": "datetime" }, - "trace_id": "BLgI_VzNTCCRs_2T", + "trace_id": "kpcdkZQ2SsSOh9Lw", "type": "metric", "unit": "tokens", - "value": 20 + "value": 26 }, { "attributes": { @@ -47302,16 +51390,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "-7YS2sLl", + "span_id": "mYTkxvK_", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:30.668861+00:00", + "__datetime__": "2025-03-07T01:45:23.525770+00:00", "__module__": "datetime" }, - "trace_id": "BLgI_VzNTCCRs_2T", + "trace_id": "kpcdkZQ2SsSOh9Lw", "type": "metric", "unit": "tokens", - "value": 59 + "value": 101 } ] } @@ -47319,7 +51407,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Llama3-8B attention type\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:num-1\\nContent: 3 `_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\\nof models across a `range of different benchmarks `_.\\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. In this tutorial we will focus on the 8B size model.\\nThere are a few main changes between Llama2-7B and Llama3-8B models:\\n\\n- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B\\n- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:num-1\\nContent: instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_\\n\\n|\\n\\nGetting access to Llama3-8B-Instruct\\n------------------------------------\\n\\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. First, let's download the model from Hugging Face. You will need to follow the instructions\\non the `official Meta page `_ to gain access to the model.\\nNext, make sure you grab your Hugging Face token from `here `_.\\n\\n\\n.. code-block:: bash\\n\\n tune download meta-llama/Meta-Llama-3\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:num-0\\nContent: :`download Llama3 Instruct weights `\\n\\n\\nTemplate changes from Llama2 to Llama3\\n--------------------------------------\\n\\nThe Llama2 chat model requires a specific template when prompting the pre-trained\\nmodel. Since the chat model was pretrained with this prompt template, if you want to run\\ninference on the model, you'll need to use the same template for optimal performance\\non chat data. Otherwise, the model will just perform standard text completion, which\\nmay or may not align with your intended use case.\\n\\nFrom the `official Llama2 prompt\\ntemplate guide `_\\nfor the Llama2 chat model, we can see that special tags are added:\\n\\n.. code-block:: text\\n\\n [INST] <>\\n You are a helpful, respectful, and honest assistant.\\n <>\\n\\n Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant \\n\\nLlama3 Instruct `overhauled `\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:num-0\\nContent: 'm Meta AI, your friendly AI assistant<|eot_id|>\\n\\nThe tags are entirely different, and they are actually encoded differently than in\\nLlama2. Let's walk through tokenizing an example with the Llama2 template and the\\nLlama3 template to understand how.\\n\\n.. note::\\n The Llama3 Base model uses a `different prompt template\\n `_ than Llama3 Instruct\\n because it has not yet been instruct tuned and the extra special tokens are untrained. If you\\n are running inference on the Llama3 Base model without fine-tuning we recommend the base\\n template for optimal performance. Generally, for instruct and chat data, we recommend using\\n Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\\n Llama3 Instruct.\\n\\n.. _prompt_template_vs_special_tokens:\\n\\nTokenizing prompt templates & special tokens\\n--------------------------------------------\\n\\nLet's say I have a sample of a single user-assistant turn accompanied with a system\\nprompt:\\n\\n.. code-block:: python\\n\\n sample = [\\n {\\n \\\"role\\\": \\\"system\\\",\\n \\\"\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:num-3\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\", and \\\"output_proj\\\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\\\"q_proj\\\", \\\"v_proj\\\"])\\n\\n.. note::\\n\\n Calling :func:`lora_llama_2\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Insert documents into memory\", \"parameters\": {}, \"tool_name\": \"insert_into_memory\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -47347,7 +51435,7 @@ "data": { "event": { "delta": { - "text": "L", + "text": "[k", "type": "text" }, "event_type": { @@ -47367,7 +51455,7 @@ "data": { "event": { "delta": { - "text": "lama3-8B uses grouped-query", + "text": "nowledge", "type": "text" }, "event_type": { @@ -47387,7 +51475,7 @@ "data": { "event": { "delta": { - "text": " attention instead of", + "text": "_search(query=\"Torchtune documentation\")]", "type": "text" }, "event_type": { @@ -47407,8 +51495,19 @@ "data": { "event": { "delta": { - "text": " the standard multi-head attention.", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "Torchtune documentation" + }, + "call_id": "2ecaa92d-e752-498b-86d0-f6ee5c8b3131", + "tool_name": "knowledge_search" + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -47416,7 +51515,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -47444,55 +51547,19 @@ }, "metrics": [ { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "prompt_tokens", - "span_id": "1eIEdjPP", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:18.982970+00:00", - "__module__": "datetime" - }, - "trace_id": "rNeuYcnxTSqrP6Dg", - "type": "metric", - "unit": "tokens", - "value": 80 + "unit": null, + "value": 39 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "completion_tokens", - "span_id": "1eIEdjPP", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:18.983000+00:00", - "__module__": "datetime" - }, - "trace_id": "rNeuYcnxTSqrP6Dg", - "type": "metric", - "unit": "tokens", - "value": 28 + "unit": null, + "value": 20 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "total_tokens", - "span_id": "1eIEdjPP", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:18.983005+00:00", - "__module__": "datetime" - }, - "trace_id": "rNeuYcnxTSqrP6Dg", - "type": "metric", - "unit": "tokens", - "value": 108 + "unit": null, + "value": 59 } ] } @@ -47500,7 +51567,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Llama3-8B attention type\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:num-1\\nContent: 3 `_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\\nof models across a `range of different benchmarks `_.\\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. In this tutorial we will focus on the 8B size model.\\nThere are a few main changes between Llama2-7B and Llama3-8B models:\\n\\n- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B\\n- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:num-1\\nContent: instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_\\n\\n|\\n\\nGetting access to Llama3-8B-Instruct\\n------------------------------------\\n\\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. First, let's download the model from Hugging Face. You will need to follow the instructions\\non the `official Meta page `_ to gain access to the model.\\nNext, make sure you grab your Hugging Face token from `here `_.\\n\\n\\n.. code-block:: bash\\n\\n tune download meta-llama/Meta-Llama-3\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:num-0\\nContent: :`download Llama3 Instruct weights `\\n\\n\\nTemplate changes from Llama2 to Llama3\\n--------------------------------------\\n\\nThe Llama2 chat model requires a specific template when prompting the pre-trained\\nmodel. Since the chat model was pretrained with this prompt template, if you want to run\\ninference on the model, you'll need to use the same template for optimal performance\\non chat data. Otherwise, the model will just perform standard text completion, which\\nmay or may not align with your intended use case.\\n\\nFrom the `official Llama2 prompt\\ntemplate guide `_\\nfor the Llama2 chat model, we can see that special tags are added:\\n\\n.. code-block:: text\\n\\n [INST] <>\\n You are a helpful, respectful, and honest assistant.\\n <>\\n\\n Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant \\n\\nLlama3 Instruct `overhauled `\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:num-0\\nContent: 'm Meta AI, your friendly AI assistant<|eot_id|>\\n\\nThe tags are entirely different, and they are actually encoded differently than in\\nLlama2. Let's walk through tokenizing an example with the Llama2 template and the\\nLlama3 template to understand how.\\n\\n.. note::\\n The Llama3 Base model uses a `different prompt template\\n `_ than Llama3 Instruct\\n because it has not yet been instruct tuned and the extra special tokens are untrained. If you\\n are running inference on the Llama3 Base model without fine-tuning we recommend the base\\n template for optimal performance. Generally, for instruct and chat data, we recommend using\\n Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\\n Llama3 Instruct.\\n\\n.. _prompt_template_vs_special_tokens:\\n\\nTokenizing prompt templates & special tokens\\n--------------------------------------------\\n\\nLet's say I have a sample of a single user-assistant turn accompanied with a system\\nprompt:\\n\\n.. code-block:: python\\n\\n sample = [\\n {\\n \\\"role\\\": \\\"system\\\",\\n \\\"\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:num-3\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\", and \\\"output_proj\\\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\\\"q_proj\\\", \\\"v_proj\\\"])\\n\\n.. note::\\n\\n Calling :func:`lora_llama_2\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Llama3-8B attention type\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:num-1\\nContent: 3 `_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\\nof models across a `range of different benchmarks `_.\\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. In this tutorial we will focus on the 8B size model.\\nThere are a few main changes between Llama2-7B and Llama3-8B models:\\n\\n- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B\\n- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:num-1\\nContent: instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_\\n\\n|\\n\\nGetting access to Llama3-8B-Instruct\\n------------------------------------\\n\\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. First, let's download the model from Hugging Face. You will need to follow the instructions\\non the `official Meta page `_ to gain access to the model.\\nNext, make sure you grab your Hugging Face token from `here `_.\\n\\n\\n.. code-block:: bash\\n\\n tune download meta-llama/Meta-Llama-3\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:num-0\\nContent: :`download Llama3 Instruct weights `\\n\\n\\nTemplate changes from Llama2 to Llama3\\n--------------------------------------\\n\\nThe Llama2 chat model requires a specific template when prompting the pre-trained\\nmodel. Since the chat model was pretrained with this prompt template, if you want to run\\ninference on the model, you'll need to use the same template for optimal performance\\non chat data. Otherwise, the model will just perform standard text completion, which\\nmay or may not align with your intended use case.\\n\\nFrom the `official Llama2 prompt\\ntemplate guide `_\\nfor the Llama2 chat model, we can see that special tags are added:\\n\\n.. code-block:: text\\n\\n [INST] <>\\n You are a helpful, respectful, and honest assistant.\\n <>\\n\\n Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant \\n\\nLlama3 Instruct `overhauled `\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:num-0\\nContent: 'm Meta AI, your friendly AI assistant<|eot_id|>\\n\\nThe tags are entirely different, and they are actually encoded differently than in\\nLlama2. Let's walk through tokenizing an example with the Llama2 template and the\\nLlama3 template to understand how.\\n\\n.. note::\\n The Llama3 Base model uses a `different prompt template\\n `_ than Llama3 Instruct\\n because it has not yet been instruct tuned and the extra special tokens are untrained. If you\\n are running inference on the Llama3 Base model without fine-tuning we recommend the base\\n template for optimal performance. Generally, for instruct and chat data, we recommend using\\n Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\\n Llama3 Instruct.\\n\\n.. _prompt_template_vs_special_tokens:\\n\\nTokenizing prompt templates & special tokens\\n--------------------------------------------\\n\\nLet's say I have a sample of a single user-assistant turn accompanied with a system\\nprompt:\\n\\n.. code-block:: python\\n\\n sample = [\\n {\\n \\\"role\\\": \\\"system\\\",\\n \\\"\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:num-3\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\", and \\\"output_proj\\\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\\\"q_proj\\\", \\\"v_proj\\\"])\\n\\n.. note::\\n\\n Calling :func:`lora_llama_2\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Insert documents into memory\", \"parameters\": {}, \"tool_name\": \"insert_into_memory\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -47548,27 +51615,7 @@ "data": { "event": { "delta": { - "text": "lama3-8B uses grouped-query attention instead of", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " the standard", + "text": "lama3-8B uses grouped-query", "type": "text" }, "event_type": { @@ -47588,7 +51635,7 @@ "data": { "event": { "delta": { - "text": " multi-head attention.", + "text": " attention instead of the standard multi-head attention.", "type": "text" }, "event_type": { @@ -47625,54 +51672,18 @@ }, "metrics": [ { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "prompt_tokens", - "span_id": "SlTnlfYc", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:12.884663+00:00", - "__module__": "datetime" - }, - "trace_id": "liTx9auyTkyfvrBr", - "type": "metric", - "unit": "tokens", + "unit": null, "value": 80 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "completion_tokens", - "span_id": "SlTnlfYc", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:12.884753+00:00", - "__module__": "datetime" - }, - "trace_id": "liTx9auyTkyfvrBr", - "type": "metric", - "unit": "tokens", + "unit": null, "value": 28 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "total_tokens", - "span_id": "SlTnlfYc", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:12.884760+00:00", - "__module__": "datetime" - }, - "trace_id": "liTx9auyTkyfvrBr", - "type": "metric", - "unit": "tokens", + "unit": null, "value": 108 } ] @@ -47681,7 +51692,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Insert documents into memory\", \"parameters\": {}, \"tool_name\": \"insert_into_memory\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Llama3-8B attention type\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:num-1\\nContent: 3 `_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\\nof models across a `range of different benchmarks `_.\\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. In this tutorial we will focus on the 8B size model.\\nThere are a few main changes between Llama2-7B and Llama3-8B models:\\n\\n- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B\\n- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:num-1\\nContent: instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_\\n\\n|\\n\\nGetting access to Llama3-8B-Instruct\\n------------------------------------\\n\\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. First, let's download the model from Hugging Face. You will need to follow the instructions\\non the `official Meta page `_ to gain access to the model.\\nNext, make sure you grab your Hugging Face token from `here `_.\\n\\n\\n.. code-block:: bash\\n\\n tune download meta-llama/Meta-Llama-3\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:num-0\\nContent: :`download Llama3 Instruct weights `\\n\\n\\nTemplate changes from Llama2 to Llama3\\n--------------------------------------\\n\\nThe Llama2 chat model requires a specific template when prompting the pre-trained\\nmodel. Since the chat model was pretrained with this prompt template, if you want to run\\ninference on the model, you'll need to use the same template for optimal performance\\non chat data. Otherwise, the model will just perform standard text completion, which\\nmay or may not align with your intended use case.\\n\\nFrom the `official Llama2 prompt\\ntemplate guide `_\\nfor the Llama2 chat model, we can see that special tags are added:\\n\\n.. code-block:: text\\n\\n [INST] <>\\n You are a helpful, respectful, and honest assistant.\\n <>\\n\\n Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant \\n\\nLlama3 Instruct `overhauled `\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:num-0\\nContent: 'm Meta AI, your friendly AI assistant<|eot_id|>\\n\\nThe tags are entirely different, and they are actually encoded differently than in\\nLlama2. Let's walk through tokenizing an example with the Llama2 template and the\\nLlama3 template to understand how.\\n\\n.. note::\\n The Llama3 Base model uses a `different prompt template\\n `_ than Llama3 Instruct\\n because it has not yet been instruct tuned and the extra special tokens are untrained. If you\\n are running inference on the Llama3 Base model without fine-tuning we recommend the base\\n template for optimal performance. Generally, for instruct and chat data, we recommend using\\n Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\\n Llama3 Instruct.\\n\\n.. _prompt_template_vs_special_tokens:\\n\\nTokenizing prompt templates & special tokens\\n--------------------------------------------\\n\\nLet's say I have a sample of a single user-assistant turn accompanied with a system\\nprompt:\\n\\n.. code-block:: python\\n\\n sample = [\\n {\\n \\\"role\\\": \\\"system\\\",\\n \\\"\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:num-3\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\", and \\\"output_proj\\\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\\\"q_proj\\\", \\\"v_proj\\\"])\\n\\n.. note::\\n\\n Calling :func:`lora_llama_2\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -47709,27 +51720,7 @@ "data": { "event": { "delta": { - "text": "[k", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "nowledge_search(query=\"Llama3-8", + "text": "L", "type": "text" }, "event_type": { @@ -47749,7 +51740,7 @@ "data": { "event": { "delta": { - "text": "B attention type\")]", + "text": "lama3-8B uses grouped-query", "type": "text" }, "event_type": { @@ -47769,19 +51760,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "Llama3-8B attention type" - }, - "call_id": "4901bbdf-8faf-4a57-b6f6-01688c6290e6", - "tool_name": "knowledge_search" - }, - "type": "tool_call" + "text": " attention instead of the standard multi-head attention.", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -47789,11 +51769,7 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } @@ -47821,55 +51797,19 @@ }, "metrics": [ { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "prompt_tokens", - "span_id": "DBPomV08", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:15.412559+00:00", - "__module__": "datetime" - }, - "trace_id": "rNeuYcnxTSqrP6Dg", - "type": "metric", - "unit": "tokens", - "value": 40 + "unit": null, + "value": 80 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "completion_tokens", - "span_id": "DBPomV08", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:15.412607+00:00", - "__module__": "datetime" - }, - "trace_id": "rNeuYcnxTSqrP6Dg", - "type": "metric", - "unit": "tokens", - "value": 24 + "unit": null, + "value": 28 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "total_tokens", - "span_id": "DBPomV08", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:15.412615+00:00", - "__module__": "datetime" - }, - "trace_id": "rNeuYcnxTSqrP6Dg", - "type": "metric", - "unit": "tokens", - "value": 64 + "unit": null, + "value": 108 } ] } @@ -47877,7 +51817,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Insert documents into memory\", \"parameters\": {}, \"tool_name\": \"insert_into_memory\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -47925,7 +51865,7 @@ "data": { "event": { "delta": { - "text": "nowledge_search(query=\"Llama3-8B attention", + "text": "nowledge_search(query=\"Llama3-8", "type": "text" }, "event_type": { @@ -47945,7 +51885,7 @@ "data": { "event": { "delta": { - "text": " type\")]", + "text": "B attention type\")]", "type": "text" }, "event_type": { @@ -47974,7 +51914,7 @@ "arguments": { "query": "Llama3-8B attention type" }, - "call_id": "dd056386-b105-47e5-bd85-07e5ae096de1", + "call_id": "13953b92-bce0-463c-90ce-b2d9cca61e64", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -48017,54 +51957,18 @@ }, "metrics": [ { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "prompt_tokens", - "span_id": "yjKrmpeo", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:12.041566+00:00", - "__module__": "datetime" - }, - "trace_id": "liTx9auyTkyfvrBr", - "type": "metric", - "unit": "tokens", + "unit": null, "value": 40 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "completion_tokens", - "span_id": "yjKrmpeo", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:12.041591+00:00", - "__module__": "datetime" - }, - "trace_id": "liTx9auyTkyfvrBr", - "type": "metric", - "unit": "tokens", + "unit": null, "value": 24 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "total_tokens", - "span_id": "yjKrmpeo", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:12.041597+00:00", - "__module__": "datetime" - }, - "trace_id": "liTx9auyTkyfvrBr", - "type": "metric", - "unit": "tokens", + "unit": null, "value": 64 } ] @@ -48073,7 +51977,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the current CEO of Meta is.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"current CEO of Meta\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"{\\\"query\\\": \\\"current CEO of Meta\\\", \\\"top_k\\\": [{\\\"title\\\": \\\"Meta - Leadership & Governance\\\", \\\"url\\\": \\\"https://investor.atmeta.com/leadership-and-governance/\\\", \\\"content\\\": \\\"Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. Mark is responsible for setting the overall direction and product strategy for the company. He leads the design of Meta's services and development of its core technology and infrastructure. Mark studied computer science at Harvard\\\", \\\"score\\\": 0.8342047, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Executives - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer Joel Kaplan, Chief Global Affairs Officer Susan Li, Chief Financial Officer Javier Olivan, Chief Operating Officer Chris Cox, Chief Product Officer Andrew \\\\u2018Boz\\\\u2019 Bosworth, Chief Technology Officer Jennifer Newstead, Chief Legal Officer Dave Wehner, Chief Strategy Officer Will Cathcart, Head of WhatsApp Naomi Gleit, Head of Product John Hegeman, Chief Revenue Officer Adam Mosseri, Head of Instagram Erin Egan, Chief Privacy Officer, Policy Michel Protti, Chief Privacy Officer, Product Alex Schultz, Chief Marketing Officer and VP of Analytics Tom Alison, Head of Facebook Nicola Mendelsohn, Head of Global Business Group Ahmad Al-Dahle, VP and Head of GenAI at Meta Joelle Pineau, Vice President of AI Research and Head of FAIR at Meta\\\", \\\"score\\\": 0.8190992, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\\\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\\\u00a9 2025 Meta\\\", \\\"score\\\": 0.79099923, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Meet the Executive CSuite Team of Meta (Facebook) [2025]\\\", \\\"url\\\": \\\"https://digitaldefynd.com/IQ/meet-the-executive-csuite-team-of-meta-facebook/\\\", \\\"content\\\": \\\"Harvard University Executive Programs Free Harvard University Courses As a chief financial officer of Meta, Susan Li oversees the firm\\\\u2019s finance and facilities team to keep track of the company\\\\u2019s overall financial health. The chief operating officer of Meta, Javier Olivan, oversees the firm\\\\u2019s business team, infrastructure, and other products. Andrew Bosworth, called Boz, serves as chief technology officer at Meta and is responsible for leading the firm\\\\u2019s AR/VR organization, Reality Labs. Andrew has also served as engineering director to oversee events, mobile monetization, and feed ads and as VP of ads and business platforms to lead engineering, design, analytics, and product teams. Meta\\\\u2019s c-suite team comprises experienced and diverse executives, having extensive experience in technology, finance, legal, and all major industries.\\\", \\\"score\\\": 0.7602419, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg - Wikipedia\\\", \\\"url\\\": \\\"https://en.wikipedia.org/wiki/Mark_Zuckerberg\\\", \\\"content\\\": \\\"They began dating in 2003.[175] In September 2010, Chan, who was a medical student at the University of California, San Francisco at the time,[176] moved into his rented house in Palo Alto, California.[177][178] They married on May 19, 2012, in the grounds of his mansion in an event that also celebrated her graduation from medical school.[179][180] Zuckerberg revealed in July 2015 that they were expecting a baby girl and that Chan had previously experienced three miscarriages.[181] Their first daughter was born in December 2015.[182] They announced in a Chinese New Year video that their daughter's Chinese name is Chen Mingyu (Chinese: \\\\u9648\\\\u660e\\\\u5b87).[183] Their second daughter was born in August 2017.[184] Zuckerberg and his wife welcomed their third daughter in March 2023 and announced the news across his social media pages.[185] The couple also have a Puli dog named Beast,[186] who has over two million followers on Facebook.[187] Zuckerberg commissioned the visual artist Daniel Arsham to build a 7-foot-tall sculpture of his wife, which was unveiled in 2024.[188]\\\", \\\"score\\\": 0.05564338, \\\"raw_content\\\": null}]}\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -48101,7 +52005,7 @@ "data": { "event": { "delta": { - "text": "The", + "text": "[k", "type": "text" }, "event_type": { @@ -48121,7 +52025,27 @@ "data": { "event": { "delta": { - "text": " current CEO of Meta is Mark Zuckerberg.", + "text": "nowledge_search(query=\"Llama3-8B attention", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " type\")]", "type": "text" }, "event_type": { @@ -48135,6 +52059,41 @@ "metrics": null } }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "Llama3-8B attention type" + }, + "call_id": "0d27e59e-72dd-4976-8049-85a3e533e350", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", @@ -48158,55 +52117,19 @@ }, "metrics": [ { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "prompt_tokens", - "span_id": "oB7hDf6E", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:07.084924+00:00", - "__module__": "datetime" - }, - "trace_id": "hwA8OLUhQ1qa3ecF", - "type": "metric", - "unit": "tokens", - "value": 1145 + "unit": null, + "value": 40 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "completion_tokens", - "span_id": "oB7hDf6E", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:07.084934+00:00", - "__module__": "datetime" - }, - "trace_id": "hwA8OLUhQ1qa3ecF", - "type": "metric", - "unit": "tokens", - "value": 19 + "unit": null, + "value": 24 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "total_tokens", - "span_id": "oB7hDf6E", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:07.084936+00:00", - "__module__": "datetime" - }, - "trace_id": "hwA8OLUhQ1qa3ecF", - "type": "metric", - "unit": "tokens", - "value": 1164 + "unit": null, + "value": 64 } ] } @@ -48214,7 +52137,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the current CEO of Meta is.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the current CEO of Meta is.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"current CEO of Meta\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"{\\\"query\\\": \\\"current CEO of Meta\\\", \\\"top_k\\\": [{\\\"title\\\": \\\"Executives - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer Joel Kaplan, Chief Global Affairs Officer Susan Li, Chief Financial Officer Javier Olivan, Chief Operating Officer Chris Cox, Chief Product Officer Andrew \\\\u2018Boz\\\\u2019 Bosworth, Chief Technology Officer Jennifer Newstead, Chief Legal Officer Dave Wehner, Chief Strategy Officer Will Cathcart, Head of WhatsApp Naomi Gleit, Head of Product John Hegeman, Chief Revenue Officer Adam Mosseri, Head of Instagram Erin Egan, Chief Privacy Officer, Policy Michel Protti, Chief Privacy Officer, Product Alex Schultz, Chief Marketing Officer and VP of Analytics Tom Alison, Head of Facebook Nicola Mendelsohn, Head of Global Business Group Ahmad Al-Dahle, VP and Head of GenAI at Meta Joelle Pineau, Vice President of AI Research and Head of FAIR at Meta\\\", \\\"score\\\": 0.8190992, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\\\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\\\u00a9 2025 Meta\\\", \\\"score\\\": 0.79080546, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Meet the Executive CSuite Team of Meta (Facebook) [2025]\\\", \\\"url\\\": \\\"https://digitaldefynd.com/IQ/meet-the-executive-csuite-team-of-meta-facebook/\\\", \\\"content\\\": \\\"Harvard University Executive Programs Free Harvard University Courses As a chief financial officer of Meta, Susan Li oversees the firm\\\\u2019s finance and facilities team to keep track of the company\\\\u2019s overall financial health. The chief operating officer of Meta, Javier Olivan, oversees the firm\\\\u2019s business team, infrastructure, and other products. Andrew Bosworth, called Boz, serves as chief technology officer at Meta and is responsible for leading the firm\\\\u2019s AR/VR organization, Reality Labs. Andrew has also served as engineering director to oversee events, mobile monetization, and feed ads and as VP of ads and business platforms to lead engineering, design, analytics, and product teams. Meta\\\\u2019s c-suite team comprises experienced and diverse executives, having extensive experience in technology, finance, legal, and all major industries.\\\", \\\"score\\\": 0.7602419, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Meta - Leadership & Governance\\\", \\\"url\\\": \\\"https://investor.atmeta.com/leadership-and-governance/\\\", \\\"content\\\": \\\"Mr. Andreessen was a co-founder of Netscape Communications Corporation, a software company, serving in various positions, including Chief Technology Officer and Executive Vice President of Products. Ms. Killefer also served as Assistant Secretary for Management, Chief Financial Officer, and Chief Operating Officer of the U.S. Department of the Treasury from 1997 to 2000 and as a member of the IRS Oversight Board from 2000 to 2005, including as Chair of the IRS Oversight Board from 2002 to 2004. Ms. Travis has served as Executive Vice President and Chief Financial Officer of The Estee Lauder Companies Inc., a global manufacturer and marketer of skin care, makeup, fragrance and hair care products, since August 2012.\\\", \\\"score\\\": 0.6175132, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg - Wikipedia\\\", \\\"url\\\": \\\"https://en.wikipedia.org/wiki/Mark_Zuckerberg\\\", \\\"content\\\": \\\"They began dating in 2003.[175] In September 2010, Chan, who was a medical student at the University of California, San Francisco at the time,[176] moved into his rented house in Palo Alto, California.[177][178] They married on May 19, 2012, in the grounds of his mansion in an event that also celebrated her graduation from medical school.[179][180] Zuckerberg revealed in July 2015 that they were expecting a baby girl and that Chan had previously experienced three miscarriages.[181] Their first daughter was born in December 2015.[182] They announced in a Chinese New Year video that their daughter's Chinese name is Chen Mingyu (Chinese: \\\\u9648\\\\u660e\\\\u5b87).[183] Their second daughter was born in August 2017.[184] Zuckerberg and his wife welcomed their third daughter in March 2023 and announced the news across his social media pages.[185] The couple also have a Puli dog named Beast,[186] who has over two million followers on Facebook.[187] Zuckerberg commissioned the visual artist Daniel Arsham to build a 7-foot-tall sculpture of his wife, which was unveiled in 2024.[188]\\\", \\\"score\\\": 0.05570498, \\\"raw_content\\\": null}]}\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -48242,73 +52165,28 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "brave_search.call(query=\"current CEO of Meta\")", - "type": "tool_call" + "text": "The", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", "value": "progress" }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "current CEO of Meta" - }, - "call_id": "535c272b-768b-44fe-b303-2eae022f67f5", - "tool_name": { - "__enum__": "BuiltinTool", - "__module__": "llama_stack.models.llama.datatypes", - "value": "brave_search" - } - }, - "type": "tool_call" + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " current CEO of Meta is Mark Zuckerberg.", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -48316,11 +52194,7 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } @@ -48348,55 +52222,19 @@ }, "metrics": [ { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "prompt_tokens", - "span_id": "AZ60Ocso", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:03.907918+00:00", - "__module__": "datetime" - }, - "trace_id": "hwA8OLUhQ1qa3ecF", - "type": "metric", - "unit": "tokens", - "value": 34 + "unit": null, + "value": 1235 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "completion_tokens", - "span_id": "AZ60Ocso", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:03.907933+00:00", - "__module__": "datetime" - }, - "trace_id": "hwA8OLUhQ1qa3ecF", - "type": "metric", - "unit": "tokens", - "value": 10 + "unit": null, + "value": 19 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "total_tokens", - "span_id": "AZ60Ocso", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:03.907936+00:00", - "__module__": "datetime" - }, - "trace_id": "hwA8OLUhQ1qa3ecF", - "type": "metric", - "unit": "tokens", - "value": 44 + "unit": null, + "value": 1254 } ] } @@ -48404,7 +52242,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": \"get_boiling_point\", \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the current CEO of Meta is.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"current CEO of Meta\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"{\\\"query\\\": \\\"current CEO of Meta\\\", \\\"top_k\\\": [{\\\"title\\\": \\\"Meta - Leadership & Governance\\\", \\\"url\\\": \\\"https://investor.atmeta.com/leadership-and-governance/\\\", \\\"content\\\": \\\"Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. Mark is responsible for setting the overall direction and product strategy for the company. He leads the design of Meta's services and development of its core technology and infrastructure. Mark studied computer science at Harvard\\\", \\\"score\\\": 0.8342047, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Executives - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer Joel Kaplan, Chief Global Affairs Officer Susan Li, Chief Financial Officer Javier Olivan, Chief Operating Officer Chris Cox, Chief Product Officer Andrew \\\\u2018Boz\\\\u2019 Bosworth, Chief Technology Officer Jennifer Newstead, Chief Legal Officer Dave Wehner, Chief Strategy Officer Will Cathcart, Head of WhatsApp Naomi Gleit, Head of Product John Hegeman, Chief Revenue Officer Adam Mosseri, Head of Instagram Erin Egan, Chief Privacy Officer, Policy Michel Protti, Chief Privacy Officer, Product Alex Schultz, Chief Marketing Officer and VP of Analytics Tom Alison, Head of Facebook Nicola Mendelsohn, Head of Global Business Group Ahmad Al-Dahle, VP and Head of GenAI at Meta Joelle Pineau, Vice President of AI Research and Head of FAIR at Meta\\\", \\\"score\\\": 0.8190992, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\\\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\\\u00a9 2025 Meta\\\", \\\"score\\\": 0.79099923, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Meet the Executive CSuite Team of Meta (Facebook) [2025]\\\", \\\"url\\\": \\\"https://digitaldefynd.com/IQ/meet-the-executive-csuite-team-of-meta-facebook/\\\", \\\"content\\\": \\\"Harvard University Executive Programs Free Harvard University Courses As a chief financial officer of Meta, Susan Li oversees the firm\\\\u2019s finance and facilities team to keep track of the company\\\\u2019s overall financial health. The chief operating officer of Meta, Javier Olivan, oversees the firm\\\\u2019s business team, infrastructure, and other products. Andrew Bosworth, called Boz, serves as chief technology officer at Meta and is responsible for leading the firm\\\\u2019s AR/VR organization, Reality Labs. Andrew has also served as engineering director to oversee events, mobile monetization, and feed ads and as VP of ads and business platforms to lead engineering, design, analytics, and product teams. Meta\\\\u2019s c-suite team comprises experienced and diverse executives, having extensive experience in technology, finance, legal, and all major industries.\\\", \\\"score\\\": 0.7602419, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg - Wikipedia\\\", \\\"url\\\": \\\"https://en.wikipedia.org/wiki/Mark_Zuckerberg\\\", \\\"content\\\": \\\"They began dating in 2003.[175] In September 2010, Chan, who was a medical student at the University of California, San Francisco at the time,[176] moved into his rented house in Palo Alto, California.[177][178] They married on May 19, 2012, in the grounds of his mansion in an event that also celebrated her graduation from medical school.[179][180] Zuckerberg revealed in July 2015 that they were expecting a baby girl and that Chan had previously experienced three miscarriages.[181] Their first daughter was born in December 2015.[182] They announced in a Chinese New Year video that their daughter's Chinese name is Chen Mingyu (Chinese: \\\\u9648\\\\u660e\\\\u5b87).[183] Their second daughter was born in August 2017.[184] Zuckerberg and his wife welcomed their third daughter in March 2023 and announced the news across his social media pages.[185] The couple also have a Puli dog named Beast,[186] who has over two million followers on Facebook.[187] Zuckerberg commissioned the visual artist Daniel Arsham to build a 7-foot-tall sculpture of his wife, which was unveiled in 2024.[188]\\\", \\\"score\\\": 0.05564338, \\\"raw_content\\\": null}]}\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -48452,27 +52290,7 @@ "data": { "event": { "delta": { - "text": " boiling point of polyjuice is -100 degrees Celsius", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": ".", + "text": " current CEO of Meta is Mark Zuckerberg.", "type": "text" }, "event_type": { @@ -48514,16 +52332,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "drZjZkfj", + "span_id": "oB7hDf6E", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T02:04:33.852666+00:00", + "__datetime__": "2025-03-07T01:44:07.084924+00:00", "__module__": "datetime" }, - "trace_id": "Sn0I7GFHTxKxewK2", + "trace_id": "hwA8OLUhQ1qa3ecF", "type": "metric", "unit": "tokens", - "value": 77 + "value": 1145 }, { "attributes": { @@ -48531,16 +52349,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "drZjZkfj", + "span_id": "oB7hDf6E", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T02:04:33.852692+00:00", + "__datetime__": "2025-03-07T01:44:07.084934+00:00", "__module__": "datetime" }, - "trace_id": "Sn0I7GFHTxKxewK2", + "trace_id": "hwA8OLUhQ1qa3ecF", "type": "metric", "unit": "tokens", - "value": 23 + "value": 19 }, { "attributes": { @@ -48548,16 +52366,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "drZjZkfj", + "span_id": "oB7hDf6E", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T02:04:33.852699+00:00", + "__datetime__": "2025-03-07T01:44:07.084936+00:00", "__module__": "datetime" }, - "trace_id": "Sn0I7GFHTxKxewK2", + "trace_id": "hwA8OLUhQ1qa3ecF", "type": "metric", "unit": "tokens", - "value": 100 + "value": 1164 } ] } @@ -48565,7 +52383,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the current CEO of Meta is.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -48593,8 +52411,13 @@ "data": { "event": { "delta": { - "text": "The", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -48613,8 +52436,13 @@ "data": { "event": { "delta": { - "text": " boiling point of polyjuice is -100 degrees Celsius.", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "brave_search.call(query=\"current CEO of Meta\")", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -48627,6 +52455,45 @@ "metrics": null } }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "current CEO of Meta" + }, + "call_id": "94b2529c-a7f4-43ca-ab5c-92485d5cc6f3", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "brave_search" + } + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", @@ -48650,55 +52517,19 @@ }, "metrics": [ { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "prompt_tokens", - "span_id": "WMEZtUXH", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:00:32.617998+00:00", - "__module__": "datetime" - }, - "trace_id": "f9RM1qaUTk2LvaVo", - "type": "metric", - "unit": "tokens", - "value": 77 + "unit": null, + "value": 34 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "completion_tokens", - "span_id": "WMEZtUXH", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:00:32.618030+00:00", - "__module__": "datetime" - }, - "trace_id": "f9RM1qaUTk2LvaVo", - "type": "metric", - "unit": "tokens", - "value": 23 + "unit": null, + "value": 10 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "total_tokens", - "span_id": "WMEZtUXH", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:00:32.618036+00:00", - "__module__": "datetime" - }, - "trace_id": "f9RM1qaUTk2LvaVo", - "type": "metric", - "unit": "tokens", - "value": 100 + "unit": null, + "value": 44 } ] } @@ -48706,7 +52537,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"required\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": \"get_boiling_point\", \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -48754,7 +52585,7 @@ "data": { "event": { "delta": { - "text": " function get_boiling_point is not", + "text": " boiling point of polyjuice is -100 degrees Celsius.", "type": "text" }, "event_type": { @@ -48774,33 +52605,58 @@ "data": { "event": { "delta": { - "text": " able", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 77 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 23 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 100 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " to find the", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -48814,7 +52670,7 @@ "data": { "event": { "delta": { - "text": " boiling point of \"polyjuice\" as", + "text": "The", "type": "text" }, "event_type": { @@ -48834,7 +52690,7 @@ "data": { "event": { "delta": { - "text": " it", + "text": " boiling point of polyjuice is -100 degrees Celsius.", "type": "text" }, "event_type": { @@ -48854,33 +52710,58 @@ "data": { "event": { "delta": { - "text": " is not a real liquid", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 77 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 23 }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null + { + "metric": "total_tokens", + "unit": null, + "value": 100 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"required\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": ". Polyju", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -48894,7 +52775,7 @@ "data": { "event": { "delta": { - "text": "ice is a fictional substance from the", + "text": "The", "type": "text" }, "event_type": { @@ -48914,7 +52795,7 @@ "data": { "event": { "delta": { - "text": " Harry Potter series.", + "text": " boiling point of polyjuice is -100 degrees Celsius.", "type": "text" }, "event_type": { @@ -48951,55 +52832,19 @@ }, "metrics": [ { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "prompt_tokens", - "span_id": "p7Vx9VAq", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:04:28.232189+00:00", - "__module__": "datetime" - }, - "trace_id": "WKEqFugATCeCl8mc", - "type": "metric", - "unit": "tokens", + "unit": null, "value": 77 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "completion_tokens", - "span_id": "p7Vx9VAq", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:04:28.232325+00:00", - "__module__": "datetime" - }, - "trace_id": "WKEqFugATCeCl8mc", - "type": "metric", - "unit": "tokens", - "value": 51 + "unit": null, + "value": 23 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "total_tokens", - "span_id": "p7Vx9VAq", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:04:28.232334+00:00", - "__module__": "datetime" - }, - "trace_id": "WKEqFugATCeCl8mc", - "type": "metric", - "unit": "tokens", - "value": 128 + "unit": null, + "value": 100 } ] } @@ -50118,7 +53963,7 @@ "data": { "event": { "delta": { - "text": "get_boiling_point(liquid_name='polyjuice", + "text": "get_boiling_point(liquid_name='polyjuice', cel", "type": "text" }, "event_type": { @@ -50138,7 +53983,7 @@ "data": { "event": { "delta": { - "text": "', celcius=True)]", + "text": "cius=True)]", "type": "text" }, "event_type": { @@ -50168,7 +54013,7 @@ "celcius": true, "liquid_name": "polyjuice" }, - "call_id": "d43b2636-903d-430d-8389-91eefe5a1d75", + "call_id": "20aad753-f01f-42cc-bb68-aac292a707a1", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -50211,54 +54056,18 @@ }, "metrics": [ { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "prompt_tokens", - "span_id": "9EBiVeAT", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:04:32.221646+00:00", - "__module__": "datetime" - }, - "trace_id": "7kB12OwpSUOcwmJV", - "type": "metric", - "unit": "tokens", + "unit": null, "value": 30 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "completion_tokens", - "span_id": "9EBiVeAT", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:04:32.221673+00:00", - "__module__": "datetime" - }, - "trace_id": "7kB12OwpSUOcwmJV", - "type": "metric", - "unit": "tokens", + "unit": null, "value": 28 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "total_tokens", - "span_id": "9EBiVeAT", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:04:32.221680+00:00", - "__module__": "datetime" - }, - "trace_id": "7kB12OwpSUOcwmJV", - "type": "metric", - "unit": "tokens", + "unit": null, "value": 58 } ] @@ -50365,7 +54174,7 @@ "celcius": true, "liquid_name": "polyjuice" }, - "call_id": "0548b2ef-daa4-4099-bb2c-b34f00752339", + "call_id": "d690dafc-3fe8-4356-ba40-5682803c9fbf", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -50408,54 +54217,18 @@ }, "metrics": [ { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "prompt_tokens", - "span_id": "lc3YWIQH", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:00:31.366139+00:00", - "__module__": "datetime" - }, - "trace_id": "zDQV0rn3TNKfByA0", - "type": "metric", - "unit": "tokens", + "unit": null, "value": 30 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "completion_tokens", - "span_id": "lc3YWIQH", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:00:31.366166+00:00", - "__module__": "datetime" - }, - "trace_id": "zDQV0rn3TNKfByA0", - "type": "metric", - "unit": "tokens", + "unit": null, "value": 28 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "total_tokens", - "span_id": "lc3YWIQH", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:00:31.366172+00:00", - "__module__": "datetime" - }, - "trace_id": "zDQV0rn3TNKfByA0", - "type": "metric", - "unit": "tokens", + "unit": null, "value": 58 } ] @@ -50512,47 +54285,7 @@ "data": { "event": { "delta": { - "text": "juice is a fictional potion from", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " the Harry Potter series by J.K. Rowling. As it", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "'s not a real substance, it doesn't have a boiling point", + "text": "juice is a fictional potion from the Harry Potter series by", "type": "text" }, "event_type": { @@ -50572,7 +54305,7 @@ "data": { "event": { "delta": { - "text": ". Polyjuice Potion is a magical concoction", + "text": " J.K. Rowling. As it's not a", "type": "text" }, "event_type": { @@ -50592,7 +54325,7 @@ "data": { "event": { "delta": { - "text": " that allows the drinker to assume the form and", + "text": " real substance, it doesn't have a boiling point. Polyjuice Potion is", "type": "text" }, "event_type": { @@ -50612,7 +54345,7 @@ "data": { "event": { "delta": { - "text": " appearance", + "text": " a magical concoction that allows the drinker to assume the form and", "type": "text" }, "event_type": { @@ -50632,7 +54365,7 @@ "data": { "event": { "delta": { - "text": " of another person, but it's not a physical substance that can", + "text": " appearance of another person, but it's not a physical substance that can be measured or analyzed in", "type": "text" }, "event_type": { @@ -50652,7 +54385,7 @@ "data": { "event": { "delta": { - "text": " be measured or analyzed in the same way as real-world", + "text": " the same way as real-world", "type": "text" }, "event_type": { @@ -50692,7 +54425,7 @@ "data": { "event": { "delta": { - "text": " have any other questions or", + "text": " have any other questions or if there's anything else I can", "type": "text" }, "event_type": { @@ -50712,7 +54445,7 @@ "data": { "event": { "delta": { - "text": " if there's anything else I can help you with, feel free to ask", + "text": " help you with", "type": "text" }, "event_type": { @@ -50732,7 +54465,7 @@ "data": { "event": { "delta": { - "text": "!", + "text": ", feel free to ask!", "type": "text" }, "event_type": { @@ -50769,54 +54502,18 @@ }, "metrics": [ { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "prompt_tokens", - "span_id": "M0oC9v8Y", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:04:30.531648+00:00", - "__module__": "datetime" - }, - "trace_id": "0CMlh2kQShSVm3zE", - "type": "metric", - "unit": "tokens", + "unit": null, "value": 30 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "completion_tokens", - "span_id": "M0oC9v8Y", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:04:30.531666+00:00", - "__module__": "datetime" - }, - "trace_id": "0CMlh2kQShSVm3zE", - "type": "metric", - "unit": "tokens", + "unit": null, "value": 113 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "total_tokens", - "span_id": "M0oC9v8Y", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:04:30.531671+00:00", - "__module__": "datetime" - }, - "trace_id": "0CMlh2kQShSVm3zE", - "type": "metric", - "unit": "tokens", + "unit": null, "value": 143 } ] @@ -50873,7 +54570,7 @@ "data": { "event": { "delta": { - "text": "get_boiling_point(liquid_name='polyjuice', cel", + "text": "get_boiling_point(liquid_name='polyjuice", "type": "text" }, "event_type": { @@ -50893,7 +54590,7 @@ "data": { "event": { "delta": { - "text": "cius=True)]", + "text": "', celcius=True)]", "type": "text" }, "event_type": { @@ -50923,7 +54620,7 @@ "celcius": true, "liquid_name": "polyjuice" }, - "call_id": "acbb04a1-08f4-4277-9b66-aadda2fa2be7", + "call_id": "8671dbc5-2f70-48a3-a844-94ad7652a468", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -50966,54 +54663,18 @@ }, "metrics": [ { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "prompt_tokens", - "span_id": "jMXDDKvp", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:04:26.175063+00:00", - "__module__": "datetime" - }, - "trace_id": "44TwzIrGS2aqfbVn", - "type": "metric", - "unit": "tokens", + "unit": null, "value": 30 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "completion_tokens", - "span_id": "jMXDDKvp", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:04:26.175128+00:00", - "__module__": "datetime" - }, - "trace_id": "44TwzIrGS2aqfbVn", - "type": "metric", - "unit": "tokens", + "unit": null, "value": 28 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "total_tokens", - "span_id": "jMXDDKvp", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T02:04:26.175137+00:00", - "__module__": "datetime" - }, - "trace_id": "44TwzIrGS2aqfbVn", - "type": "metric", - "unit": "tokens", + "unit": null, "value": 58 } ] @@ -51127,54 +54788,18 @@ }, "metrics": [ { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "prompt_tokens", - "span_id": "bxIams_G", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:13.404182+00:00", - "__module__": "datetime" - }, - "trace_id": "snO106yxStaL10ow", - "type": "metric", - "unit": "tokens", - "value": 252 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, + "unit": null, + "value": 252 + }, + { "metric": "completion_tokens", - "span_id": "bxIams_G", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:13.404224+00:00", - "__module__": "datetime" - }, - "trace_id": "snO106yxStaL10ow", - "type": "metric", - "unit": "tokens", + "unit": null, "value": 20 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "total_tokens", - "span_id": "bxIams_G", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:13.404230+00:00", - "__module__": "datetime" - }, - "trace_id": "snO106yxStaL10ow", - "type": "metric", - "unit": "tokens", + "unit": null, "value": 272 } ] @@ -51266,7 +54891,57 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "\n if n <= 3:\n return True", + "tool_call": "\n if n <=", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " ", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "3:\n return True\n if n % 2 == 0", "type": "tool_call" }, "event_type": { @@ -51291,7 +54966,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "\n if n % 2 == 0 or n % 3", + "tool_call": " or n % 3 == 0:\n return False\n i", "type": "tool_call" }, "event_type": { @@ -51316,7 +54991,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " == 0:\n return False\n i = 5\n ", + "tool_call": " = 5\n while i * i <= n:\n if n", "type": "tool_call" }, "event_type": { @@ -51341,7 +55016,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " while i * i <= n:\n if n", + "tool_call": " % i == 0 or n % (i + 2) ==", "type": "tool_call" }, "event_type": { @@ -51366,7 +55041,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " % i == 0 or n % (i", + "tool_call": " 0:\n return False\n i += 6\n return", "type": "tool_call" }, "event_type": { @@ -51391,7 +55066,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " + 2) == 0:\n return False\n i +=", + "tool_call": " True\n\ndef nth_prime", "type": "tool_call" }, "event_type": { @@ -51416,7 +55091,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " 6\n return True\n\ndef nth_prime(n):\n count =", + "tool_call": "(n):\n count = 0\n num", "type": "tool_call" }, "event_type": { @@ -51441,7 +55116,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " 0\n num = 2\n while True:\n if", + "tool_call": " = 2\n while True:\n if is_prime(num):\n ", "type": "tool_call" }, "event_type": { @@ -51466,7 +55141,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " is_prime(num):\n count += 1\n if count == n", + "tool_call": " count += 1\n", "type": "tool_call" }, "event_type": { @@ -51491,7 +55166,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": ":\n return num\n num += 1\n\nprint(nth_prime", + "tool_call": " if count == n:\n return num\n", "type": "tool_call" }, "event_type": { @@ -51516,7 +55191,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "(100))", + "tool_call": " num += 1\n\nprint(nth_prime(100))", "type": "tool_call" }, "event_type": { @@ -51545,7 +55220,7 @@ "arguments": { "code": "def is_prime(n):\n if n <= 1:\n return False\n if n <= 3:\n return True\n if n % 2 == 0 or n % 3 == 0:\n return False\n i = 5\n while i * i <= n:\n if n % i == 0 or n % (i + 2) == 0:\n return False\n i += 6\n return True\n\ndef nth_prime(n):\n count = 0\n num = 2\n while True:\n if is_prime(num):\n count += 1\n if count == n:\n return num\n num += 1\n\nprint(nth_prime(100))" }, - "call_id": "e1110bc1-dc83-480d-ad33-09d49f5ccc8d", + "call_id": "836806e9-2184-4dac-9769-d94a496f9f95", "tool_name": { "__enum__": "BuiltinTool", "__module__": "llama_stack.models.llama.datatypes", @@ -51592,54 +55267,18 @@ }, "metrics": [ { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "prompt_tokens", - "span_id": "5J3hM-La", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:09.121100+00:00", - "__module__": "datetime" - }, - "trace_id": "snO106yxStaL10ow", - "type": "metric", - "unit": "tokens", + "unit": null, "value": 40 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "completion_tokens", - "span_id": "5J3hM-La", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:09.121127+00:00", - "__module__": "datetime" - }, - "trace_id": "snO106yxStaL10ow", - "type": "metric", - "unit": "tokens", + "unit": null, "value": 10 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "total_tokens", - "span_id": "5J3hM-La", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:09.121132+00:00", - "__module__": "datetime" - }, - "trace_id": "snO106yxStaL10ow", - "type": "metric", - "unit": "tokens", + "unit": null, "value": 50 } ] @@ -51696,7 +55335,27 @@ "data": { "event": { "delta": { - "text": "plexity the company was founded in 2022.", + "text": "plexity the company was founded in 202", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "2.", "type": "text" }, "event_type": { @@ -51733,54 +55392,18 @@ }, "metrics": [ { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "prompt_tokens", - "span_id": "6jxCq3gU", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:50.430436+00:00", - "__module__": "datetime" - }, - "trace_id": "XhZWljYTTDCYF7vI", - "type": "metric", - "unit": "tokens", + "unit": null, "value": 68 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "completion_tokens", - "span_id": "6jxCq3gU", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:50.430477+00:00", - "__module__": "datetime" - }, - "trace_id": "XhZWljYTTDCYF7vI", - "type": "metric", - "unit": "tokens", + "unit": null, "value": 22 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "total_tokens", - "span_id": "6jxCq3gU", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:50.430489+00:00", - "__module__": "datetime" - }, - "trace_id": "XhZWljYTTDCYF7vI", - "type": "metric", - "unit": "tokens", + "unit": null, "value": 90 } ] @@ -51886,7 +55509,7 @@ "arguments": { "query": "Perplexity the company founding date" }, - "call_id": "199ef050-bc11-4e4b-935d-f5241c3f40ef", + "call_id": "3c2feef6-e21b-4715-83a5-61d24d09d7b5", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -51929,54 +55552,18 @@ }, "metrics": [ { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "prompt_tokens", - "span_id": "m4wMGuSN", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:49.880525+00:00", - "__module__": "datetime" - }, - "trace_id": "XhZWljYTTDCYF7vI", - "type": "metric", - "unit": "tokens", + "unit": null, "value": 29 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "completion_tokens", - "span_id": "m4wMGuSN", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:49.880576+00:00", - "__module__": "datetime" - }, - "trace_id": "XhZWljYTTDCYF7vI", - "type": "metric", - "unit": "tokens", + "unit": null, "value": 23 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "total_tokens", - "span_id": "m4wMGuSN", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:49.880585+00:00", - "__module__": "datetime" - }, - "trace_id": "XhZWljYTTDCYF7vI", - "type": "metric", - "unit": "tokens", + "unit": null, "value": 52 } ] @@ -52110,54 +55697,18 @@ }, "metrics": [ { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "prompt_tokens", - "span_id": "OyfVMRgR", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:53.322420+00:00", - "__module__": "datetime" - }, - "trace_id": "TMrhR55CR-KrmGp0", - "type": "metric", - "unit": "tokens", + "unit": null, "value": 63 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "completion_tokens", - "span_id": "OyfVMRgR", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:53.322482+00:00", - "__module__": "datetime" - }, - "trace_id": "TMrhR55CR-KrmGp0", - "type": "metric", - "unit": "tokens", + "unit": null, "value": 45 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "total_tokens", - "span_id": "OyfVMRgR", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:53.322490+00:00", - "__module__": "datetime" - }, - "trace_id": "TMrhR55CR-KrmGp0", - "type": "metric", - "unit": "tokens", + "unit": null, "value": 108 } ] @@ -52243,7 +55794,7 @@ "arguments": { "query": "NBA creation date" }, - "call_id": "388e55ab-448a-4a98-905b-196c051bdeea", + "call_id": "2a770dea-edee-4890-b0e3-930a4cb167e3", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -52286,54 +55837,18 @@ }, "metrics": [ { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "prompt_tokens", - "span_id": "QpFMmy3B", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:52.235138+00:00", - "__module__": "datetime" - }, - "trace_id": "TMrhR55CR-KrmGp0", - "type": "metric", - "unit": "tokens", + "unit": null, "value": 27 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "completion_tokens", - "span_id": "QpFMmy3B", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:52.235160+00:00", - "__module__": "datetime" - }, - "trace_id": "TMrhR55CR-KrmGp0", - "type": "metric", - "unit": "tokens", + "unit": null, "value": 20 }, { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, "metric": "total_tokens", - "span_id": "QpFMmy3B", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:52.235165+00:00", - "__module__": "datetime" - }, - "trace_id": "TMrhR55CR-KrmGp0", - "type": "metric", - "unit": "tokens", + "unit": null, "value": 47 } ] diff --git a/tests/integration/fixtures/recorded_responses/invoke_tool.json b/tests/integration/fixtures/recorded_responses/invoke_tool.json index 8db8ad9661..a0cf276c2f 100644 --- a/tests/integration/fixtures/recorded_responses/invoke_tool.json +++ b/tests/integration/fixtures/recorded_responses/invoke_tool.json @@ -64,6 +64,19 @@ } } }, + "[[], {\"kwargs\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics of the dataframe\\nprint(df.describe())\", \"session_id\": \"\"}, \"tool_name\": \"code_interpreter\"}]": { + "type": "value", + "value": { + "__module__": "llama_stack.apis.tools.tools", + "__pydantic__": "ToolInvocationResult", + "data": { + "content": "error\n[stdout]\n[Errno 2] No such file or directory: 'bwrap'\n[/stdout]\n[stderr]\n[Errno 2] No such file or directory: 'bwrap'\n[/stderr]", + "error_code": null, + "error_message": null, + "metadata": null + } + } + }, "[[], {\"kwargs\": {\"code\": \"import pandas as pd\\ndf = pd.read_csv(\\\"\")\\nprint(df.head())\", \"session_id\": \"\"}, \"tool_name\": \"code_interpreter\"}]": { "type": "value", "value": { @@ -400,23 +413,23 @@ "type": "text" }, { - "text": "Result 1:\nDocument_id:bbddb\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\n``conversation_column`` and ``conversation_style``. Our data follows the ``\"sharegpt\"`` format, so\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\nlook like so:\n\n.. code-block:: python\n\n from torchtune.datasets import chat_dataset\n from torchtune.models.llama3 import llama3_tokenizer\n\n tokenizer = llama3_tokenizer(\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\")\n ds = chat_dataset(\n tokenizer=tokenizer,\n source=\"json\",\n data_files=\"data/my_data.json\",\n split=\"train\",\n conversation_column=\"dialogue\",\n conversation_style=\"sharegpt\",\n )\n\n.. code-block:: yaml\n\n # In config\n tokenizer:\n _component_: torchtune.models.llama3.llama3_tokenizer\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\n\n dataset:\n _component_: torchtune.datasets.chat_dataset\n source: json\n data_files: data/my_data.json\n split: train\n conversation_column: dialogue\n conversation_style: sharegpt\n\n.. note::\n You can pass in any keyword argument for `load_dataset `_ into all our\n Dataset classes and they will honor them. This is useful for common parameters\n such as specifying the data split with :code:`split` or configuration with\n :code:`name`\n\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\nall messages according to their `recommendations `_ into all our\n Dataset classes and they will honor them. This is useful for common parameters\n such as specifying the data split with :code:`split` or configuration with\n :code:`name`\n\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW `.\n.. .. _glossary_fsdp2:\n\n", + "text": "Result 3:\nDocument_id:90a49\nContent: ` module, which we swap\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP `.\n.. .. _glossary_fsdp2:\n\n", "type": "text" }, { - "text": "Result 4:\nDocument_id:15b86\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", + "text": "Result 4:\nDocument_id:14c27\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", "type": "text" }, { - "text": "Result 5:\nDocument_id:83901\nContent: etune\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.use_dora=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n use_dora: True\n\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\neven more memory savings!\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.apply_lora_to_mlp=True \\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\n model.lora_rank=16 \\\n model.lora_alpha=32 \\\n model.use_dora=True \\\n model.quantize_base=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n apply_lora_to_mlp: True\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\n lora_rank: 16\n lora_alpha: 32\n use_dora: True\n quantize_base: True\n\n\n.. note::\n\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP `.\n.. .. _glossary_fsdp2:\n\n", + "text": "Result 5:\nDocument_id:90a49\nContent: etune\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.use_dora=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n use_dora: True\n\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\neven more memory savings!\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.apply_lora_to_mlp=True \\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\n model.lora_rank=16 \\\n model.lora_alpha=32 \\\n model.use_dora=True \\\n model.quantize_base=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n apply_lora_to_mlp: True\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\n lora_rank: 16\n lora_alpha: 32\n use_dora: True\n quantize_base: True\n\n\n.. note::\n\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP `.\n.. .. _glossary_fsdp2:\n\n", "type": "text" }, { @@ -428,11 +441,11 @@ "error_message": null, "metadata": { "document_ids": [ - "bbddbe62-508d-4c8d-9455-3b60bc2825a5", - "15b8638f-b1b6-4f58-adfa-eb6644c47de3", - "83901b53-33d4-4f5e-8145-b94c783e9f61", - "15b8638f-b1b6-4f58-adfa-eb6644c47de3", - "83901b53-33d4-4f5e-8145-b94c783e9f61" + "3e31d46a-5568-49d4-978e-0ab3b6598a7f", + "14c2766f-d176-468d-9124-f4a46c5166cf", + "90a49271-c246-4bdc-9db9-e3bfa9be5ff1", + "14c2766f-d176-468d-9124-f4a46c5166cf", + "90a49271-c246-4bdc-9db9-e3bfa9be5ff1" ] } } @@ -444,7 +457,7 @@ "__module__": "llama_stack.apis.tools.tools", "__pydantic__": "ToolInvocationResult", "data": { - "content": "{\"query\": \"current CEO of Meta\", \"top_k\": [{\"title\": \"Meta - Leadership & Governance\", \"url\": \"https://investor.atmeta.com/leadership-and-governance/\", \"content\": \"Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. Mark is responsible for setting the overall direction and product strategy for the company. He leads the design of Meta's services and development of its core technology and infrastructure. Mark studied computer science at Harvard\", \"score\": 0.8342047, \"raw_content\": null}, {\"title\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\", \"url\": \"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\", \"content\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\u00a9 2025 Meta\", \"score\": 0.79099923, \"raw_content\": null}, {\"title\": \"The 11 People Running Meta's $1 Trillion Social Media and ... - Observer\", \"url\": \"https://observer.com/2024/01/meta-facebook-top-executives/\", \"content\": \"Meta has one of the most stable leadership team in the tech industry. Almost all of Meta's top executives have been with the company for well over a decade. ... 39, cofounder, chairman and CEO\", \"score\": 0.45536873, \"raw_content\": null}, {\"title\": \"Executives - Meta\", \"url\": \"https://about.meta.com/media-gallery/executives/\", \"content\": \"Meta leadership: images of senior executives for download to use in articles about the company.\", \"score\": 0.21026355, \"raw_content\": null}, {\"title\": \"Mark Zuckerberg - Wikipedia\", \"url\": \"https://en.wikipedia.org/wiki/Mark_Zuckerberg\", \"content\": \"They began dating in 2003.[175] In September 2010, Chan, who was a medical student at the University of California, San Francisco at the time,[176] moved into his rented house in Palo Alto, California.[177][178] They married on May 19, 2012, in the grounds of his mansion in an event that also celebrated her graduation from medical school.[179][180] Zuckerberg revealed in July 2015 that they were expecting a baby girl and that Chan had previously experienced three miscarriages.[181] Their first daughter was born in December 2015.[182] They announced in a Chinese New Year video that their daughter's Chinese name is Chen Mingyu (Chinese: \\u9648\\u660e\\u5b87).[183] Their second daughter was born in August 2017.[184] Zuckerberg and his wife welcomed their third daughter in March 2023 and announced the news across his social media pages.[185] The couple also have a Puli dog named Beast,[186] who has over two million followers on Facebook.[187] Zuckerberg commissioned the visual artist Daniel Arsham to build a 7-foot-tall sculpture of his wife, which was unveiled in 2024.[188]\", \"score\": 0.05564338, \"raw_content\": null}]}", + "content": "{\"query\": \"current CEO of Meta\", \"top_k\": [{\"title\": \"Executives - Meta\", \"url\": \"https://about.meta.com/media-gallery/executives/\", \"content\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer Joel Kaplan, Chief Global Affairs Officer Susan Li, Chief Financial Officer Javier Olivan, Chief Operating Officer Chris Cox, Chief Product Officer Andrew \\u2018Boz\\u2019 Bosworth, Chief Technology Officer Jennifer Newstead, Chief Legal Officer Dave Wehner, Chief Strategy Officer Will Cathcart, Head of WhatsApp Naomi Gleit, Head of Product John Hegeman, Chief Revenue Officer Adam Mosseri, Head of Instagram Erin Egan, Chief Privacy Officer, Policy Michel Protti, Chief Privacy Officer, Product Alex Schultz, Chief Marketing Officer and VP of Analytics Tom Alison, Head of Facebook Nicola Mendelsohn, Head of Global Business Group Ahmad Al-Dahle, VP and Head of GenAI at Meta Joelle Pineau, Vice President of AI Research and Head of FAIR at Meta\", \"score\": 0.8190992, \"raw_content\": null}, {\"title\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\", \"url\": \"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\", \"content\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\u00a9 2025 Meta\", \"score\": 0.79080546, \"raw_content\": null}, {\"title\": \"Meet the Executive CSuite Team of Meta (Facebook) [2025]\", \"url\": \"https://digitaldefynd.com/IQ/meet-the-executive-csuite-team-of-meta-facebook/\", \"content\": \"Harvard University Executive Programs Free Harvard University Courses As a chief financial officer of Meta, Susan Li oversees the firm\\u2019s finance and facilities team to keep track of the company\\u2019s overall financial health. The chief operating officer of Meta, Javier Olivan, oversees the firm\\u2019s business team, infrastructure, and other products. Andrew Bosworth, called Boz, serves as chief technology officer at Meta and is responsible for leading the firm\\u2019s AR/VR organization, Reality Labs. Andrew has also served as engineering director to oversee events, mobile monetization, and feed ads and as VP of ads and business platforms to lead engineering, design, analytics, and product teams. Meta\\u2019s c-suite team comprises experienced and diverse executives, having extensive experience in technology, finance, legal, and all major industries.\", \"score\": 0.7602419, \"raw_content\": null}, {\"title\": \"Meta - Leadership & Governance\", \"url\": \"https://investor.atmeta.com/leadership-and-governance/\", \"content\": \"Mr. Andreessen was a co-founder of Netscape Communications Corporation, a software company, serving in various positions, including Chief Technology Officer and Executive Vice President of Products. Ms. Killefer also served as Assistant Secretary for Management, Chief Financial Officer, and Chief Operating Officer of the U.S. Department of the Treasury from 1997 to 2000 and as a member of the IRS Oversight Board from 2000 to 2005, including as Chair of the IRS Oversight Board from 2002 to 2004. Ms. Travis has served as Executive Vice President and Chief Financial Officer of The Estee Lauder Companies Inc., a global manufacturer and marketer of skin care, makeup, fragrance and hair care products, since August 2012.\", \"score\": 0.6175132, \"raw_content\": null}, {\"title\": \"Mark Zuckerberg - Wikipedia\", \"url\": \"https://en.wikipedia.org/wiki/Mark_Zuckerberg\", \"content\": \"They began dating in 2003.[175] In September 2010, Chan, who was a medical student at the University of California, San Francisco at the time,[176] moved into his rented house in Palo Alto, California.[177][178] They married on May 19, 2012, in the grounds of his mansion in an event that also celebrated her graduation from medical school.[179][180] Zuckerberg revealed in July 2015 that they were expecting a baby girl and that Chan had previously experienced three miscarriages.[181] Their first daughter was born in December 2015.[182] They announced in a Chinese New Year video that their daughter's Chinese name is Chen Mingyu (Chinese: \\u9648\\u660e\\u5b87).[183] Their second daughter was born in August 2017.[184] Zuckerberg and his wife welcomed their third daughter in March 2023 and announced the news across his social media pages.[185] The couple also have a Puli dog named Beast,[186] who has over two million followers on Facebook.[187] Zuckerberg commissioned the visual artist Daniel Arsham to build a 7-foot-tall sculpture of his wife, which was unveiled in 2024.[188]\", \"score\": 0.05570498, \"raw_content\": null}]}", "error_code": null, "error_message": null, "metadata": null @@ -463,23 +476,23 @@ "type": "text" }, { - "text": "Result 1:\nDocument_id:20e5d\nContent: .. _lora_finetune_label:\n\n============================\nFine-Tuning Llama2 with LoRA\n============================\n\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\n See :ref:`below` for how to do this.\n\nLet's inspect each of these models a bit more closely.\n\n.. code-block:: bash\n\n # Print the first layer's self-attention in the usual Llama2 model\n >>> print(base_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (pos_embeddings): RotaryPositionalEmbeddings()\n )\n\n # Print the same for Llama2 with LoRA weights\n >>> print(lora_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): LoRALinear(\n (dropout): Dropout(p=0.0, inplace=False)\n \n", + "text": "Result 2:\nDocument_id:14c27\nContent: LoRA to Llama2 models\n------------------------------\n\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\n\n.. code-block:: python\n\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\n\n # Build Llama2 without any LoRA layers\n base_model = llama2_7b()\n\n # The default settings for lora_llama2_7b will match those for llama2_7b\n # We just need to define which layers we want LoRA applied to.\n # Within each self-attention, we can choose from [\"q_proj\", \"k_proj\", \"v_proj\", and \"output_proj\"].\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\n # layers outside of the self-attention.\n lora_model = lora_llama2_7b(lora_attn_modules=[\"q_proj\", \"v_proj\"])\n\n.. note::\n\n Calling :func:`lora_llama_2_7b ` alone will not handle the definition of which parameters are trainable.\n See :ref:`below` for how to do this.\n\nLet's inspect each of these models a bit more closely.\n\n.. code-block:: bash\n\n # Print the first layer's self-attention in the usual Llama2 model\n >>> print(base_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (pos_embeddings): RotaryPositionalEmbeddings()\n )\n\n # Print the same for Llama2 with LoRA weights\n >>> print(lora_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): LoRALinear(\n (dropout): Dropout(p=0.0, inplace=False)\n \n", "type": "text" }, { - "text": "Result 3:\nDocument_id:20e5d\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", + "text": "Result 3:\nDocument_id:14c27\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", "type": "text" }, { - "text": "Result 4:\nDocument_id:20e5d\nContent: from our Llama2\nmodel without any wrappers or custom checkpoint conversion logic.\n\n.. code-block:: python\n\n # Assuming that base_model already has the pretrained Llama2 weights,\n # this will directly load them into your LoRA model without any conversion necessary.\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\n\n.. note::\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\n :func:`validate_missing_and_unexpected_for_lora() `.\n\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\n\n.. _setting_trainable_params:\n\n.. code-block:: python\n\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\n\n # Fetch all params from the model that are associated with LoRA.\n lora_params = get_adapter_params(lora_model)\n\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\n set_trainable_params(lora_model, lora_params)\n\n # Print the total number of parameters\n total_params = sum([p.numel() for p in lora_model.parameters()])\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\n print(\n f\"\"\"\n {total_params} total params,\n {trainable_params}\" trainable params,\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\n \"\"\"\n )\n\n 6742609920 total params,\n 4194304 trainable params,\n 0.06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `.\n\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\n\n.. _setting_trainable_params:\n\n.. code-block:: python\n\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\n\n # Fetch all params from the model that are associated with LoRA.\n lora_params = get_adapter_params(lora_model)\n\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\n set_trainable_params(lora_model, lora_params)\n\n # Print the total number of parameters\n total_params = sum([p.numel() for p in lora_model.parameters()])\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\n print(\n f\"\"\"\n {total_params} total params,\n {trainable_params}\" trainable params,\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\n \"\"\"\n )\n\n 6742609920 total params,\n 4194304 trainable params,\n 0.06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe Date: Wed, 12 Mar 2025 23:34:31 -0700 Subject: [PATCH 14/14] revert mocks --- .../recorded_responses/chat_completion.json | 9043 +++++------------ .../recorded_responses/invoke_tool.json | 35 +- 2 files changed, 2775 insertions(+), 6303 deletions(-) diff --git a/tests/integration/fixtures/recorded_responses/chat_completion.json b/tests/integration/fixtures/recorded_responses/chat_completion.json index 80744576ea..8694cc2713 100644 --- a/tests/integration/fixtures/recorded_responses/chat_completion.json +++ b/tests/integration/fixtures/recorded_responses/chat_completion.json @@ -39542,7 +39542,7 @@ "data": { "event": { "delta": { - "text": " provided function definitions are", + "text": " provided function definitions", "type": "text" }, "event_type": { @@ -39562,7 +39562,7 @@ "data": { "event": { "delta": { - "text": " not suitable for this task", + "text": " are not suitable", "type": "text" }, "event_type": { @@ -39582,7 +39582,7 @@ "data": { "event": { "delta": { - "text": ". Please rework them", + "text": " for this task. Please re", "type": "text" }, "event_type": { @@ -39602,7 +39602,27 @@ "data": { "event": { "delta": { - "text": " to align with the task requirements.", + "text": "work them to", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " align with the task requirements.", "type": "text" }, "event_type": { @@ -39639,18 +39659,54 @@ }, "metrics": [ { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "prompt_tokens", - "unit": null, + "span_id": "D2n_IS_8", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:03:32.021393+00:00", + "__module__": "datetime" + }, + "trace_id": "amAiZv5PQKSsA74j", + "type": "metric", + "unit": "tokens", "value": 90 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "completion_tokens", - "unit": null, + "span_id": "D2n_IS_8", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:03:32.021420+00:00", + "__module__": "datetime" + }, + "trace_id": "amAiZv5PQKSsA74j", + "type": "metric", + "unit": "tokens", "value": 32 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "total_tokens", - "unit": null, + "span_id": "D2n_IS_8", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:03:32.021427+00:00", + "__module__": "datetime" + }, + "trace_id": "amAiZv5PQKSsA74j", + "type": "metric", + "unit": "tokens", "value": 122 } ] @@ -40495,7 +40551,7 @@ "data": { "event": { "delta": { - "text": "get_boiling_point(liquid_name='polyjuice', celcius", + "text": "get_boiling_point(liquid_name='polyjuice", "type": "text" }, "event_type": { @@ -40515,7 +40571,7 @@ "data": { "event": { "delta": { - "text": "=True)]", + "text": "', celcius=True)]", "type": "text" }, "event_type": { @@ -40545,7 +40601,7 @@ "celcius": true, "liquid_name": "polyjuice" }, - "call_id": "152ff1b5-7bcf-4d46-8c8f-e5c13f646925", + "call_id": "3955f756-9aa0-433f-be8f-af8941c220de", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -40588,18 +40644,54 @@ }, "metrics": [ { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "prompt_tokens", - "unit": null, + "span_id": "QZ6PSGpT", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:03:29.629456+00:00", + "__module__": "datetime" + }, + "trace_id": "M72bosg8TBe3uhx3", + "type": "metric", + "unit": "tokens", "value": 43 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "completion_tokens", - "unit": null, + "span_id": "QZ6PSGpT", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:03:29.629488+00:00", + "__module__": "datetime" + }, + "trace_id": "M72bosg8TBe3uhx3", + "type": "metric", + "unit": "tokens", "value": 28 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "total_tokens", - "unit": null, + "span_id": "QZ6PSGpT", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:03:29.629494+00:00", + "__module__": "datetime" + }, + "trace_id": "M72bosg8TBe3uhx3", + "type": "metric", + "unit": "tokens", "value": 71 } ] @@ -40656,7 +40748,7 @@ "data": { "event": { "delta": { - "text": " function get_boiling_point was called with", + "text": " function call returned an", "type": "text" }, "event_type": { @@ -40676,7 +40768,7 @@ "data": { "event": { "delta": { - "text": " the parameters liquid_name = \"polyjuice\" and", + "text": " error since", "type": "text" }, "event_type": { @@ -40696,7 +40788,7 @@ "data": { "event": { "delta": { - "text": " celcius = True. However, the function returned -", + "text": " \"", "type": "text" }, "event_type": { @@ -40716,7 +40808,7 @@ "data": { "event": { "delta": { - "text": "100, which is not a valid", + "text": "polyjuice\" is", "type": "text" }, "event_type": { @@ -40736,7 +40828,7 @@ "data": { "event": { "delta": { - "text": " boiling point. This suggests that the function does not", + "text": " not a real liquid. Polyju", "type": "text" }, "event_type": { @@ -40756,7 +40848,7 @@ "data": { "event": { "delta": { - "text": " have the boiling point of \"poly", + "text": "ice is a fictional substance from the", "type": "text" }, "event_type": { @@ -40776,7 +40868,7 @@ "data": { "event": { "delta": { - "text": "juice\" in its database.", + "text": " Harry Potter series. The boiling point", "type": "text" }, "event_type": { @@ -40796,58 +40888,33 @@ "data": { "event": { "delta": { - "text": "", + "text": " of a substance is a physical", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, - "metrics": [ - { - "metric": "prompt_tokens", - "unit": null, - "value": 84 - }, - { - "metric": "completion_tokens", - "unit": null, - "value": 73 - }, - { - "metric": "total_tokens", - "unit": null, - "value": 157 - } - ] + "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", + "text": " property that can be measured and", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -40861,7 +40928,7 @@ "data": { "event": { "delta": { - "text": "The", + "text": " quantified", "type": "text" }, "event_type": { @@ -40881,7 +40948,7 @@ "data": { "event": { "delta": { - "text": " function get_boiling_point is not", + "text": ", but it only applies", "type": "text" }, "event_type": { @@ -40901,7 +40968,7 @@ "data": { "event": { "delta": { - "text": " recognized.", + "text": " to real substances that exist in the physical world.", "type": "text" }, "event_type": { @@ -40943,16 +41010,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "Z7jBGJ-8", + "span_id": "y9SHtJTQ", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:55.401637+00:00", + "__datetime__": "2025-03-07T02:05:01.411612+00:00", "__module__": "datetime" }, - "trace_id": "WxMAq579Q-ixJ3wJ", + "trace_id": "_I2Cu85IRtOSBSX9", "type": "metric", "unit": "tokens", - "value": 93 + "value": 84 }, { "attributes": { @@ -40960,16 +41027,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "Z7jBGJ-8", + "span_id": "y9SHtJTQ", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:55.401666+00:00", + "__datetime__": "2025-03-07T02:05:01.411644+00:00", "__module__": "datetime" }, - "trace_id": "WxMAq579Q-ixJ3wJ", + "trace_id": "_I2Cu85IRtOSBSX9", "type": "metric", "unit": "tokens", - "value": 20 + "value": 73 }, { "attributes": { @@ -40977,16 +41044,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "Z7jBGJ-8", + "span_id": "y9SHtJTQ", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:55.401670+00:00", + "__datetime__": "2025-03-07T02:05:01.411650+00:00", "__module__": "datetime" }, - "trace_id": "WxMAq579Q-ixJ3wJ", + "trace_id": "_I2Cu85IRtOSBSX9", "type": "metric", "unit": "tokens", - "value": 113 + "value": 157 } ] } @@ -40994,7 +41061,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point_with_metadata\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point_with_metadata\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -41042,7 +41109,7 @@ "data": { "event": { "delta": { - "text": " function call should have been", + "text": " function get_boiling_point is not", "type": "text" }, "event_type": { @@ -41062,7 +41129,7 @@ "data": { "event": { "delta": { - "text": " \n[get_boiling_point_with", + "text": " recognized.", "type": "text" }, "event_type": { @@ -41082,53 +41149,94 @@ "data": { "event": { "delta": { - "text": "_metadata(liquid_name='polyjuice', celci", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "us=True)] \nHowever since the", - "type": "text" + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "Z7jBGJ-8", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:55.401637+00:00", + "__module__": "datetime" + }, + "trace_id": "WxMAq579Q-ixJ3wJ", + "type": "metric", + "unit": "tokens", + "value": 93 }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "Z7jBGJ-8", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:55.401666+00:00", + "__module__": "datetime" + }, + "trace_id": "WxMAq579Q-ixJ3wJ", + "type": "metric", + "unit": "tokens", + "value": 20 }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "Z7jBGJ-8", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:55.401670+00:00", + "__module__": "datetime" + }, + "trace_id": "WxMAq579Q-ixJ3wJ", + "type": "metric", + "unit": "tokens", + "value": 113 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point_with_metadata\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point_with_metadata\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " function get_boiling_point_with_metadata does not", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -41142,7 +41250,7 @@ "data": { "event": { "delta": { - "text": " exist in the ipython environment,", + "text": "The", "type": "text" }, "event_type": { @@ -41162,7 +41270,7 @@ "data": { "event": { "delta": { - "text": " it will throw an", + "text": " function get_bo", "type": "text" }, "event_type": { @@ -41182,7 +41290,7 @@ "data": { "event": { "delta": { - "text": " error. \n\nIn order to", + "text": "iling_point_with_metadata does not exist,", "type": "text" }, "event_type": { @@ -41202,7 +41310,7 @@ "data": { "event": { "delta": { - "text": " get the correct answer, you would need to define the function", + "text": " I will", "type": "text" }, "event_type": { @@ -41222,7 +41330,7 @@ "data": { "event": { "delta": { - "text": " get_boiling_point_with_metadata first. \n\nHere is how you", + "text": " assume you", "type": "text" }, "event_type": { @@ -41242,7 +41350,7 @@ "data": { "event": { "delta": { - "text": " can define it:\n\n```python\ndef get_boiling", + "text": " meant get_bo", "type": "text" }, "event_type": { @@ -41262,7 +41370,7 @@ "data": { "event": { "delta": { - "text": "_point_with_metadata(liquid_name, celcius=True):\n", + "text": "iling_point_with_metadata", "type": "text" }, "event_type": { @@ -41282,7 +41390,7 @@ "data": { "event": { "delta": { - "text": " # This is a mock implementation,", + "text": ". The boiling point of polyjuice", "type": "text" }, "event_type": { @@ -41302,7 +41410,7 @@ "data": { "event": { "delta": { - "text": " you would need a real", + "text": " is -100.", "type": "text" }, "event_type": { @@ -41322,33 +41430,94 @@ "data": { "event": { "delta": { - "text": " database of boiling points\n boiling_points", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " = {\n \"water\": 100,\n \"polyjuice\":", - "type": "text" + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "8dM6i5mO", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:05:03.329281+00:00", + "__module__": "datetime" + }, + "trace_id": "zMJDP5dXRrChi7uE", + "type": "metric", + "unit": "tokens", + "value": 86 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "8dM6i5mO", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:05:03.329312+00:00", + "__module__": "datetime" + }, + "trace_id": "zMJDP5dXRrChi7uE", + "type": "metric", + "unit": "tokens", + "value": 45 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "8dM6i5mO", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:05:03.329318+00:00", + "__module__": "datetime" + }, + "trace_id": "zMJDP5dXRrChi7uE", + "type": "metric", + "unit": "tokens", + "value": 131 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point_with_metadata\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point_with_metadata` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point_with_metadata\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -41362,7 +41531,7 @@ "data": { "event": { "delta": { - "text": " 120 #", + "text": "The", "type": "text" }, "event_type": { @@ -41382,7 +41551,7 @@ "data": { "event": { "delta": { - "text": " This is fictional, polyjuice is not a", + "text": " function get_boiling_point_with_metadata(", "type": "text" }, "event_type": { @@ -41402,7 +41571,7 @@ "data": { "event": { "delta": { - "text": " real liquid\n", + "text": "liquid_name=\"polyjuice\", celcius=True) should be", "type": "text" }, "event_type": { @@ -41422,7 +41591,7 @@ "data": { "event": { "delta": { - "text": " }\n \n if liquid_name in boiling", + "text": " used to get the answer.", "type": "text" }, "event_type": { @@ -41442,33 +41611,94 @@ "data": { "event": { "delta": { - "text": "_points:\n if celcius:\n return boiling_points[liquid_name", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "pzQMKAJc", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:56.809816+00:00", + "__module__": "datetime" + }, + "trace_id": "018KkGcOThSSiZfE", + "type": "metric", + "unit": "tokens", + "value": 97 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "pzQMKAJc", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:56.809911+00:00", + "__module__": "datetime" + }, + "trace_id": "018KkGcOThSSiZfE", + "type": "metric", + "unit": "tokens", + "value": 39 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "pzQMKAJc", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:56.809922+00:00", + "__module__": "datetime" + }, + "trace_id": "018KkGcOThSSiZfE", + "type": "metric", + "unit": "tokens", + "value": 136 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "]\n else:\n return boiling_points[liquid_name] *", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -41482,7 +41712,7 @@ "data": { "event": { "delta": { - "text": " 9/5 + 32\n else:\n", + "text": "[", "type": "text" }, "event_type": { @@ -41502,7 +41732,7 @@ "data": { "event": { "delta": { - "text": " return \"Boiling point not found\"\n```\n\nThen you", + "text": "get_boiling_point(liquid_name='polyjuice", "type": "text" }, "event_type": { @@ -41522,7 +41752,7 @@ "data": { "event": { "delta": { - "text": " can call the function:\n\n```python\nprint(get_bo", + "text": "', celcius=True)]", "type": "text" }, "event_type": { @@ -41542,8 +41772,20 @@ "data": { "event": { "delta": { - "text": "iling_point_with_metadata(\"polyjuice\"))\n```", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "celcius": true, + "liquid_name": "polyjuice" + }, + "call_id": "328cb19d-47bb-47cc-8258-a5ca2e26803e", + "tool_name": "get_boiling_point" + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -41551,7 +41793,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -41579,19 +41825,55 @@ }, "metrics": [ { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "prompt_tokens", - "unit": null, - "value": 86 + "span_id": "dS0bhfN_", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:04:53.324788+00:00", + "__module__": "datetime" + }, + "trace_id": "UJz5Cas1SDyQYeBk", + "type": "metric", + "unit": "tokens", + "value": 37 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "completion_tokens", - "unit": null, - "value": 250 + "span_id": "dS0bhfN_", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:04:53.324835+00:00", + "__module__": "datetime" + }, + "trace_id": "UJz5Cas1SDyQYeBk", + "type": "metric", + "unit": "tokens", + "value": 28 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "total_tokens", - "unit": null, - "value": 336 + "span_id": "dS0bhfN_", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:04:53.324844+00:00", + "__module__": "datetime" + }, + "trace_id": "UJz5Cas1SDyQYeBk", + "type": "metric", + "unit": "tokens", + "value": 65 } ] } @@ -41599,7 +41881,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point_with_metadata\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"Unknown tool `get_boiling_point_with_metadata` was called.\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point_with_metadata\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -41627,7 +41909,7 @@ "data": { "event": { "delta": { - "text": "The", + "text": "[", "type": "text" }, "event_type": { @@ -41647,7 +41929,7 @@ "data": { "event": { "delta": { - "text": " function get_boiling_point_with_metadata(", + "text": "get_boiling_point_with_metadata", "type": "text" }, "event_type": { @@ -41667,7 +41949,7 @@ "data": { "event": { "delta": { - "text": "liquid_name=\"polyjuice\", celcius=True) should be", + "text": "(liquid_name='polyjuice', cel", "type": "text" }, "event_type": { @@ -41687,7 +41969,7 @@ "data": { "event": { "delta": { - "text": " used to get the answer.", + "text": "cius=True)]", "type": "text" }, "event_type": { @@ -41701,6 +41983,42 @@ "metrics": null } }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "celcius": true, + "liquid_name": "polyjuice" + }, + "call_id": "5bb48d00-7d5c-49e2-bddf-e5fdc5f35485", + "tool_name": "get_boiling_point_with_metadata" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", @@ -41729,16 +42047,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "pzQMKAJc", + "span_id": "mfrFN7m2", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:56.809816+00:00", + "__datetime__": "2025-03-07T02:05:02.136501+00:00", "__module__": "datetime" }, - "trace_id": "018KkGcOThSSiZfE", + "trace_id": "T4eddr4-SMWPQwKA", "type": "metric", "unit": "tokens", - "value": 97 + "value": 37 }, { "attributes": { @@ -41746,16 +42064,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "pzQMKAJc", + "span_id": "mfrFN7m2", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:56.809911+00:00", + "__datetime__": "2025-03-07T02:05:02.136529+00:00", "__module__": "datetime" }, - "trace_id": "018KkGcOThSSiZfE", + "trace_id": "T4eddr4-SMWPQwKA", "type": "metric", "unit": "tokens", - "value": 39 + "value": 30 }, { "attributes": { @@ -41763,16 +42081,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "pzQMKAJc", + "span_id": "mfrFN7m2", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:56.809922+00:00", + "__datetime__": "2025-03-07T02:05:02.136535+00:00", "__module__": "datetime" }, - "trace_id": "018KkGcOThSSiZfE", + "trace_id": "T4eddr4-SMWPQwKA", "type": "metric", "unit": "tokens", - "value": 136 + "value": 67 } ] } @@ -41780,7 +42098,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Give me a sentence that contains the word: hello\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": []}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -41808,7 +42126,7 @@ "data": { "event": { "delta": { - "text": "[", + "text": "When", "type": "text" }, "event_type": { @@ -41828,7 +42146,7 @@ "data": { "event": { "delta": { - "text": "get_boiling_point(liquid_name='polyjuice', cel", + "text": " I answered the", "type": "text" }, "event_type": { @@ -41848,7 +42166,7 @@ "data": { "event": { "delta": { - "text": "cius=True)]", + "text": " phone, the friendly", "type": "text" }, "event_type": { @@ -41868,20 +42186,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "celcius": true, - "liquid_name": "polyjuice" - }, - "call_id": "2dece34e-68c3-43a9-b685-e229569135ab", - "tool_name": "get_boiling_point" - }, - "type": "tool_call" + "text": " voice on the other end said \"hello\"", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -41889,11 +42195,27 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " and asked how I was doing.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null }, "metrics": null } @@ -41921,19 +42243,55 @@ }, "metrics": [ { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "prompt_tokens", - "unit": null, - "value": 37 + "span_id": "tJEuRhla", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:01.044284+00:00", + "__module__": "datetime" + }, + "trace_id": "bnDS7Z41TRO0UyfH", + "type": "metric", + "unit": "tokens", + "value": 30 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "completion_tokens", - "unit": null, - "value": 28 + "span_id": "tJEuRhla", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:01.044312+00:00", + "__module__": "datetime" + }, + "trace_id": "bnDS7Z41TRO0UyfH", + "type": "metric", + "unit": "tokens", + "value": 34 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "total_tokens", - "unit": null, - "value": 65 + "span_id": "tJEuRhla", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:01.044318+00:00", + "__module__": "datetime" + }, + "trace_id": "bnDS7Z41TRO0UyfH", + "type": "metric", + "unit": "tokens", + "value": 64 } ] } @@ -41941,7 +42299,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\\n# Sample of data\\nprint(\\\"Data sample from file:\\\")\\nprint(df.head())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\\n# Sample of data\\nprint(\\\"Data sample from file:\\\")\\nprint(df.head())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -41969,7 +42327,7 @@ "data": { "event": { "delta": { - "text": "[", + "text": "I", "type": "text" }, "event_type": { @@ -41989,7 +42347,7 @@ "data": { "event": { "delta": { - "text": "get_boiling_point_with_metadata(liquid_name='poly", + "text": " am not able", "type": "text" }, "event_type": { @@ -42009,7 +42367,7 @@ "data": { "event": { "delta": { - "text": "juice', celcius=True)]", + "text": " to execute this task as", "type": "text" }, "event_type": { @@ -42029,20 +42387,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "celcius": true, - "liquid_name": "polyjuice" - }, - "call_id": "24730d85-818c-4719-8a28-903160341849", - "tool_name": "get_boiling_point_with_metadata" - }, - "type": "tool_call" + "text": " it exceeds the", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -42050,11 +42396,47 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " limitations of the functions I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " have been given.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null }, "metrics": null } @@ -42082,19 +42464,55 @@ }, "metrics": [ { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "prompt_tokens", - "unit": null, - "value": 37 + "span_id": "5If5go-q", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:48.070675+00:00", + "__module__": "datetime" + }, + "trace_id": "StUjhrTMQKKQSRvS", + "type": "metric", + "unit": "tokens", + "value": 433 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "completion_tokens", - "unit": null, - "value": 30 + "span_id": "5If5go-q", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:48.070742+00:00", + "__module__": "datetime" + }, + "trace_id": "StUjhrTMQKKQSRvS", + "type": "metric", + "unit": "tokens", + "value": 31 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "total_tokens", - "unit": null, - "value": 67 + "span_id": "5If5go-q", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:48.070750+00:00", + "__module__": "datetime" + }, + "trace_id": "StUjhrTMQKKQSRvS", + "type": "metric", + "unit": "tokens", + "value": 464 } ] } @@ -42102,7 +42520,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Give me a sentence that contains the word: hello\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": []}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\\n# Sample of data\\nprint(\\\"Data sample from file:\\\")\\nprint(df.head())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -42130,8 +42548,13 @@ "data": { "event": { "delta": { - "text": "When", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -42150,8 +42573,13 @@ "data": { "event": { "delta": { - "text": " I answered the", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\n# Load data\ndf =", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -42170,8 +42598,13 @@ "data": { "event": { "delta": { - "text": " phone, the friendly voice on the other end said", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " pd.read_csv(\"/var/folders/rb/qv8vwgyj", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -42190,8 +42623,13 @@ "data": { "event": { "delta": { - "text": " \"hello\" and asked how I was doing", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "6yjd3t4pwsy9t0rm0000", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -42210,8 +42648,13 @@ "data": { "event": { "delta": { - "text": ".", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "gn/T/tmp2x_sml66/ZEjbinQHin", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -42230,58 +42673,18 @@ "data": { "event": { "delta": { - "text": "", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "flation.csv\")\n# Rows\nprint(\"Number of rows and columns in the", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "metric": "prompt_tokens", - "unit": null, - "value": 30 - }, - { - "metric": "completion_tokens", - "unit": null, - "value": 34 - }, - { - "metric": "total_tokens", - "unit": null, - "value": 64 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\\n# Sample of data\\nprint(\\\"Data sample from file:\\\")\\nprint(df.head())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\\n# Sample of data\\nprint(\\\"Data sample from file:\\\")\\nprint(df.head())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -42295,8 +42698,13 @@ "data": { "event": { "delta": { - "text": "I", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " data:\", df.shape)\n# Columns\nprint(\"Columns of the data are:\",", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -42315,8 +42723,13 @@ "data": { "event": { "delta": { - "text": " am not able", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " len(df.columns))\n# Column names\nprint(\"Columns of the data", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -42335,8 +42748,13 @@ "data": { "event": { "delta": { - "text": " to execute this task as", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " are:\", df.columns)\n# Column dtypes\nprint(\"Datatype of the columns are:\", df", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -42355,8 +42773,13 @@ "data": { "event": { "delta": { - "text": " it exceeds the", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".dtypes)\n# Sample of data\nprint(\"Data sample from file:\")\n", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -42375,8 +42798,13 @@ "data": { "event": { "delta": { - "text": " limitations of the functions I", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "print(df.head())", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -42395,8 +42823,23 @@ "data": { "event": { "delta": { - "text": " have been given.", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\n# Load data\ndf = pd.read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4pwsy9t0rm0000gn/T/tmp2x_sml66/ZEjbinQHinflation.csv\")\n# Rows\nprint(\"Number of rows and columns in the data:\", df.shape)\n# Columns\nprint(\"Columns of the data are:\", len(df.columns))\n# Column names\nprint(\"Columns of the data are:\", df.columns)\n# Column dtypes\nprint(\"Datatype of the columns are:\", df.dtypes)\n# Sample of data\nprint(\"Data sample from file:\")\nprint(df.head())" + }, + "call_id": "1df8b196-9eff-4b06-97e7-ab175c741e8f", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -42404,7 +42847,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -42437,16 +42884,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "5If5go-q", + "span_id": "fLqIbpek", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:48.070675+00:00", + "__datetime__": "2025-03-07T01:45:40.262304+00:00", "__module__": "datetime" }, "trace_id": "StUjhrTMQKKQSRvS", "type": "metric", "unit": "tokens", - "value": 433 + "value": 235 }, { "attributes": { @@ -42454,16 +42901,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "5If5go-q", + "span_id": "fLqIbpek", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:48.070742+00:00", + "__datetime__": "2025-03-07T01:45:40.262340+00:00", "__module__": "datetime" }, "trace_id": "StUjhrTMQKKQSRvS", "type": "metric", "unit": "tokens", - "value": 31 + "value": 10 }, { "attributes": { @@ -42471,16 +42918,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "5If5go-q", + "span_id": "fLqIbpek", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:48.070750+00:00", + "__datetime__": "2025-03-07T01:45:40.262347+00:00", "__module__": "datetime" }, "trace_id": "StUjhrTMQKKQSRvS", "type": "metric", "unit": "tokens", - "value": 464 + "value": 245 } ] } @@ -42488,7 +42935,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n# Load data\\ndf = pd.read_csv(\\\"\")\\n# Rows\\nprint(\\\"Number of rows and columns in the data:\\\", df.shape)\\n# Columns\\nprint(\\\"Columns of the data are:\\\", len(df.columns))\\n# Column names\\nprint(\\\"Columns of the data are:\\\", df.columns)\\n# Column dtypes\\nprint(\\\"Datatype of the columns are:\\\", df.dtypes)\\n# Sample of data\\nprint(\\\"Data sample from file:\\\")\\nprint(df.head())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -42516,8 +42963,13 @@ "data": { "event": { "delta": { - "text": "The", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -42536,8 +42988,13 @@ "data": { "event": { "delta": { - "text": " error message indicates that the file 'bwrap' was not", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\n# Load data\ndf = pd", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -42556,8 +43013,13 @@ "data": { "event": { "delta": { - "text": " found. This is likely because the file path provided", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -42576,8 +43038,13 @@ "data": { "event": { "delta": { - "text": " is incorrect or the file does not exist in", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "pwsy9t0rm0000gn/T/tmp2x_sml66/ZEj", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -42596,8 +43063,13 @@ "data": { "event": { "delta": { - "text": " the current working directory.\n\nTo resolve this", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "binQHinflation.csv\")\n# Rows\nprint(\"Number of rows and columns in the data", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -42616,8 +43088,13 @@ "data": { "event": { "delta": { - "text": " issue, you can try the following:\n\n1. Check", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ":\", df.shape)\n# Columns\nprint(\"Columns of the data are:\", len(df.columns))\n#", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -42636,8 +43113,13 @@ "data": { "event": { "delta": { - "text": " the file path: Ensure that the file path provided is", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " Column names\nprint(\"Columns of the data are:\", df.columns)\n# Column dtypes\nprint", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -42656,8 +43138,13 @@ "data": { "event": { "delta": { - "text": " correct and the file exists in the specified location.\n2. Use the correct", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "(\"Datatype of the columns are:\", df.dtypes)\n# Sample", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -42676,8 +43163,13 @@ "data": { "event": { "delta": { - "text": " file path: If the file is located in a different directory,", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " of data\nprint(\"Data sample from file:\")\nprint(df.head())", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -42696,8 +43188,23 @@ "data": { "event": { "delta": { - "text": " provide the correct file path.\n3. Check the file name: Ensure that", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\n# Load data\ndf = pd.read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4pwsy9t0rm0000gn/T/tmp2x_sml66/ZEjbinQHinflation.csv\")\n# Rows\nprint(\"Number of rows and columns in the data:\", df.shape)\n# Columns\nprint(\"Columns of the data are:\", len(df.columns))\n# Column names\nprint(\"Columns of the data are:\", df.columns)\n# Column dtypes\nprint(\"Datatype of the columns are:\", df.dtypes)\n# Sample of data\nprint(\"Data sample from file:\")\nprint(df.head())" + }, + "call_id": "c1708ded-f272-4008-b91f-19d61780c394", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -42705,7 +43212,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -42716,53 +43227,94 @@ "data": { "event": { "delta": { - "text": " the file name is correct and matches the one provided", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " in the code.\n4. Use the absolute", - "type": "text" + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "KTMayjIE", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:37.305765+00:00", + "__module__": "datetime" + }, + "trace_id": "StUjhrTMQKKQSRvS", + "type": "metric", + "unit": "tokens", + "value": 37 }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "KTMayjIE", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:37.305820+00:00", + "__module__": "datetime" + }, + "trace_id": "StUjhrTMQKKQSRvS", + "type": "metric", + "unit": "tokens", + "value": 10 }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "KTMayjIE", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:37.305832+00:00", + "__module__": "datetime" + }, + "trace_id": "StUjhrTMQKKQSRvS", + "type": "metric", + "unit": "tokens", + "value": 47 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics about the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"The error message indicates that the file \\\"\" does not exist. This could be due to a variety of reasons such as the file being deleted, the path being incorrect, or the file not being accessible.\\n\\nTo resolve this issue, you can try the following:\\n\\n1. Check the file path: Ensure that the file path is correct and the file exists at that location.\\n2. Check file permissions: Ensure that the file is accessible and you have the necessary permissions to read it.\\n3. Try a different file: If the file is not accessible, try loading a different file to see if the issue is specific to this file or a general issue with your code.\\n4. Check for typos: Ensure that there are no typos in the file path or the code.\\n\\nIf you are still having issues, please provide more details about the file and the code you are using, and I'll be happy to help further.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Convert the 'Year' column to datetime\\ndf['Year'] = pd.to_datetime(df['Year'], format='%Y')\\n\\n# Group by 'Year' and calculate the average inflation\\ndf_avg_inflation = df.groupby('Year')['Inflation'].mean().reset_index()\\n\\n# Plot the average yearly inflation as a time series\\nplt.figure(figsize=(10,6))\\nplt.plot(df_avg_inflation['Year'], df_avg_inflation['Inflation'], marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Inflation')\\nplt.grid(True)\\nplt.show()\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " file path: Instead of", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -42776,7 +43328,7 @@ "data": { "event": { "delta": { - "text": " using a relative file path, try using the absolute file path to", + "text": "The", "type": "text" }, "event_type": { @@ -42796,7 +43348,7 @@ "data": { "event": { "delta": { - "text": " the file.\n\nIf you are", + "text": " error message indicates that the file \"/var/folders/rb/qv8", "type": "text" }, "event_type": { @@ -42816,7 +43368,7 @@ "data": { "event": { "delta": { - "text": " still encountering issues, please provide more details about", + "text": "vwgyj6yjd3t4pwsy9t0", "type": "text" }, "event_type": { @@ -42836,7 +43388,7 @@ "data": { "event": { "delta": { - "text": " the file and its location, and I'll be happy to assist you further", + "text": "rm0000gn/T/tmp2x_sml66/9vY", "type": "text" }, "event_type": { @@ -42856,7 +43408,7 @@ "data": { "event": { "delta": { - "text": ".", + "text": "vmVRoinflation.csv\" does not exist. This could be due to", "type": "text" }, "event_type": { @@ -42876,58 +43428,13 @@ "data": { "event": { "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "metric": "prompt_tokens", - "unit": null, - "value": 238 - }, - { - "metric": "completion_tokens", - "unit": null, - "value": 183 - }, - { - "metric": "total_tokens", - "unit": null, - "value": 421 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv file, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", + "text": " a variety of reasons such as the file being deleted, the path being incorrect", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -42941,13 +43448,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" + "text": ", or the file not being accessible.\n\nTo resolve this issue, you can", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -42966,13 +43468,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "import pandas as pd\n# Load data\ndf = pd.read_csv(\"/var", - "type": "tool_call" + "text": " try the following:\n\n1. Check the file path: Ensure that the file", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -42991,13 +43488,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "/folders/rb/qvqvwgyj6yjd3", - "type": "tool_call" + "text": " path is correct and the file exists at that location.\n2. Check file permissions:", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -43016,13 +43508,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "t4pwsy9t0rm0000gn/T/tmpcf_0q3u", - "type": "tool_call" + "text": " Ensure that the file is accessible and you have the necessary permissions to", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -43041,13 +43528,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "1/q2GJtbK2inflation.csv\")\n# Rows\nprint(\"Number of rows", - "type": "tool_call" + "text": " read it.\n3. Try a different file: If the file is not", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -43066,13 +43548,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " and columns in the data:\", df.shape)\n# Columns\nprint(\"Columns of the data are:\",", - "type": "tool_call" + "text": " accessible, try loading a different file to see if the issue is specific to", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -43091,13 +43568,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " len(df.columns))\n# Column names\nprint(\"Columns of the data are:\", df.columns)\n#", - "type": "tool_call" + "text": " this file or a general issue with your code.\n4. Check for ty", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -43116,13 +43588,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " Column dtypes\nprint(\"Datatype of the columns are:\", df.dtypes)\n#", - "type": "tool_call" + "text": "pos: Ensure that there are no typos in the file path or the", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -43141,13 +43608,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " Sample of data\nprint(\"Data sample from file:\")\nprint(df.head())", - "type": "tool_call" + "text": " code.\n\nIf you are still having issues, please provide more details about the file and the code", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -43166,23 +43628,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "code": "import pandas as pd\n# Load data\ndf = pd.read_csv(\"/var/folders/rb/qvqvwgyj6yjd3t4pwsy9t0rm0000gn/T/tmpcf_0q3u1/q2GJtbK2inflation.csv\")\n# Rows\nprint(\"Number of rows and columns in the data:\", df.shape)\n# Columns\nprint(\"Columns of the data are:\", len(df.columns))\n# Column names\nprint(\"Columns of the data are:\", df.columns)\n# Column dtypes\nprint(\"Datatype of the columns are:\", df.dtypes)\n# Sample of data\nprint(\"Data sample from file:\")\nprint(df.head())" - }, - "call_id": "e57e8c8f-b440-46c6-aa6e-3d0e743e2fd9", - "tool_name": { - "__enum__": "BuiltinTool", - "__module__": "llama_stack.models.llama.datatypes", - "value": "code_interpreter" - } - }, - "type": "tool_call" + "text": " you are using, and I'll be happy to help further.", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -43190,11 +43637,7 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } @@ -43222,19 +43665,55 @@ }, "metrics": [ { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "prompt_tokens", - "unit": null, - "value": 37 + "span_id": "f28sT2i7", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:23.262530+00:00", + "__module__": "datetime" + }, + "trace_id": "8YKzpfybSiGgrHOF", + "type": "metric", + "unit": "tokens", + "value": 680 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "completion_tokens", - "unit": null, - "value": 10 + "span_id": "f28sT2i7", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:23.262555+00:00", + "__module__": "datetime" + }, + "trace_id": "8YKzpfybSiGgrHOF", + "type": "metric", + "unit": "tokens", + "value": 238 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "total_tokens", - "unit": null, - "value": 47 + "span_id": "f28sT2i7", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:23.262558+00:00", + "__module__": "datetime" + }, + "trace_id": "8YKzpfybSiGgrHOF", + "type": "metric", + "unit": "tokens", + "value": 918 } ] } @@ -43242,7 +43721,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics about the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"The error message indicates that the file \\\"\" does not exist. This could be due to a variety of reasons such as the file being deleted, the path being incorrect, or the file not being accessible.\\n\\nTo resolve this issue, you can try the following:\\n\\n1. Check the file path: Ensure that the file path is correct and the file exists at that location.\\n2. Check file permissions: Ensure that the file is accessible and you have the necessary permissions to read it.\\n3. Try a different file: If the file is not accessible, try loading a different file to see if the issue is specific to this file or a general issue with your code.\\n4. Check for typos: Ensure that there are no typos in the file path or the code.\\n\\nIf you are still having issues, please provide more details about the file and the code you are using, and I'll be happy to help further.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Convert the 'Year' column to datetime\\ndf['Year'] = pd.to_datetime(df['Year'], format='%Y')\\n\\n# Group by 'Year' and calculate the average inflation\\ndf_avg_inflation = df.groupby('Year')['Inflation'].mean().reset_index()\\n\\n# Plot the average yearly inflation as a time series\\nplt.figure(figsize=(10,6))\\nplt.plot(df_avg_inflation['Year'], df_avg_inflation['Inflation'], marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Inflation')\\nplt.grid(True)\\nplt.show()\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics about the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"The error message indicates that the file \\\"\" does not exist. This could be due to a variety of reasons such as the file being deleted, the path being incorrect, or the file not being accessible.\\n\\nTo resolve this issue, you can try the following:\\n\\n1. Check the file path: Ensure that the file path is correct and the file exists at that location.\\n2. Check file permissions: Ensure that the file is accessible and you have the necessary permissions to read it.\\n3. Try a different file: If the file is not accessible, try loading a different file to see if the issue is specific to this file or a general issue with your code.\\n4. Check for typos: Ensure that there are no typos in the file path or the code.\\n\\nIf you are still having issues, please provide more details about the file and the code you are using, and I'll be happy to help further.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -43270,8 +43749,13 @@ "data": { "event": { "delta": { - "text": "The", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -43290,8 +43774,13 @@ "data": { "event": { "delta": { - "text": " error message indicates that the file \"/var/folders/rb/qv8", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load the CSV", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -43310,8 +43799,13 @@ "data": { "event": { "delta": { - "text": "vwgyj6yjd3t4pwsy9t0", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " file\ndf = pd.read_csv(\"/var/folders/rb/qv", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -43330,8 +43824,13 @@ "data": { "event": { "delta": { - "text": "rm0000gn/T/tmp2x_sml66/9vY", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "8vwgyj6yjd3t4pwsy9t", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -43350,8 +43849,13 @@ "data": { "event": { "delta": { - "text": "vmVRoinflation.csv\" does not exist. This could be due to", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "0rm0000gn/T/tmp2x_sml66/9v", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -43370,8 +43874,13 @@ "data": { "event": { "delta": { - "text": " a variety of reasons such as the file being deleted, the path being incorrect", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "YvmVRoinflation.csv\")\n\n# Convert the 'Year'", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -43390,8 +43899,13 @@ "data": { "event": { "delta": { - "text": ", or the file not being accessible.\n\nTo resolve this issue, you can", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " column to datetime\ndf['Year'] = pd.to_datetime(df['Year", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -43410,8 +43924,13 @@ "data": { "event": { "delta": { - "text": " try the following:\n\n1. Check the file path: Ensure that the file", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "'], format='%Y')\n\n# Group by", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -43430,8 +43949,13 @@ "data": { "event": { "delta": { - "text": " path is correct and the file exists at that location.\n2. Check file permissions:", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " 'Year' and calculate the average inflation\ndf_avg_in", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -43450,8 +43974,13 @@ "data": { "event": { "delta": { - "text": " Ensure that the file is accessible and you have the necessary permissions to", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "flation = df.groupby('Year')['Inflation'].mean().reset_index()\n\n", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -43470,8 +43999,13 @@ "data": { "event": { "delta": { - "text": " read it.\n3. Try a different file: If the file is not", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "# Plot the average yearly inflation as a time series\n", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -43490,8 +44024,13 @@ "data": { "event": { "delta": { - "text": " accessible, try loading a different file to see if the issue is specific to", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "plt.figure(figsize=(10,6))\nplt.plot(df_avg_inflation['", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -43510,8 +44049,13 @@ "data": { "event": { "delta": { - "text": " this file or a general issue with your code.\n4. Check for ty", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "Year'], df_avg_in", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -43530,8 +44074,13 @@ "data": { "event": { "delta": { - "text": "pos: Ensure that there are no typos in the file path or the", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "flation['Inflation'], marker='o')\nplt", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -43550,8 +44099,13 @@ "data": { "event": { "delta": { - "text": " code.\n\nIf you are still having issues, please provide more details about the file and the code", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -43570,8 +44124,13 @@ "data": { "event": { "delta": { - "text": " you are using, and I'll be happy to help further.", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "('Inflation')\nplt.grid(True)\nplt.show()", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -43584,6 +44143,45 @@ "metrics": null } }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4pwsy9t0rm0000gn/T/tmp2x_sml66/9vYvmVRoinflation.csv\")\n\n# Convert the 'Year' column to datetime\ndf['Year'] = pd.to_datetime(df['Year'], format='%Y')\n\n# Group by 'Year' and calculate the average inflation\ndf_avg_inflation = df.groupby('Year')['Inflation'].mean().reset_index()\n\n# Plot the average yearly inflation as a time series\nplt.figure(figsize=(10,6))\nplt.plot(df_avg_inflation['Year'], df_avg_inflation['Inflation'], marker='o')\nplt.title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel('Inflation')\nplt.grid(True)\nplt.show()" + }, + "call_id": "f4efa2d4-e4e7-4ea1-8c5e-6a78bec5816f", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", @@ -43612,16 +44210,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "f28sT2i7", + "span_id": "qQY5sAli", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:23.262530+00:00", + "__datetime__": "2025-03-07T01:44:21.953806+00:00", "__module__": "datetime" }, "trace_id": "8YKzpfybSiGgrHOF", "type": "metric", "unit": "tokens", - "value": 680 + "value": 432 }, { "attributes": { @@ -43629,16 +44227,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "f28sT2i7", + "span_id": "qQY5sAli", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:23.262555+00:00", + "__datetime__": "2025-03-07T01:44:21.953843+00:00", "__module__": "datetime" }, "trace_id": "8YKzpfybSiGgrHOF", "type": "metric", "unit": "tokens", - "value": 238 + "value": 10 }, { "attributes": { @@ -43646,16 +44244,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "f28sT2i7", + "span_id": "qQY5sAli", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:23.262558+00:00", + "__datetime__": "2025-03-07T01:44:21.953847+00:00", "__module__": "datetime" }, "trace_id": "8YKzpfybSiGgrHOF", "type": "metric", "unit": "tokens", - "value": 918 + "value": 442 } ] } @@ -43663,7 +44261,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics about the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"The error message indicates that the file \\\"\" does not exist. This could be due to a variety of reasons such as the file being deleted, the path being incorrect, or the file not being accessible.\\n\\nTo resolve this issue, you can try the following:\\n\\n1. Check the file path: Ensure that the file path is correct and the file exists at that location.\\n2. Check file permissions: Ensure that the file is accessible and you have the necessary permissions to read it.\\n3. Try a different file: If the file is not accessible, try loading a different file to see if the issue is specific to this file or a general issue with your code.\\n4. Check for typos: Ensure that there are no typos in the file path or the code.\\n\\nIf you are still having issues, please provide more details about the file and the code you are using, and I'll be happy to help further.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics about the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -43691,13 +44289,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" + "text": "The", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -43716,13 +44309,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load the CSV", - "type": "tool_call" + "text": " error message indicates that the file \"/var/folders/rb/qv8vwgyj6y", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -43741,13 +44329,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " file\ndf = pd.read_csv(\"/var/folders/rb/qv", - "type": "tool_call" + "text": "jd3t4pwsy9t0rm0000gn/T/tmp2x_sml", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -43766,13 +44349,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "8vwgyj6yjd3t4pwsy9t", - "type": "tool_call" + "text": "66/9vYvmVRoinflation.csv\" does not exist. This could be", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -43791,13 +44369,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "0rm0000gn/T/tmp2x_sml66/9v", - "type": "tool_call" + "text": " due to a variety of", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -43816,13 +44389,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "YvmVRoinflation.csv\")\n\n# Convert the 'Year'", - "type": "tool_call" + "text": " reasons such as the file", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -43841,13 +44409,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " column to datetime\ndf['Year'] = pd.to_datetime(df['Year", - "type": "tool_call" + "text": " being deleted, the path being incorrect, or the file", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -43866,13 +44429,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "'], format='%Y')\n\n# Group by", - "type": "tool_call" + "text": " not being accessible.\n\nTo resolve this issue, you can try", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -43891,4725 +44449,7 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " 'Year' and calculate the average inflation\ndf_avg_in", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "flation = df.groupby('Year')['Inflation'].mean().reset_index()\n\n", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "# Plot the average yearly inflation as a time series\n", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "plt.figure(figsize=(10,6))\nplt.plot(df_avg_inflation['", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "Year'], df_avg_in", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "flation['Inflation'], marker='o')\nplt", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": ".title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "('Inflation')\nplt.grid(True)\nplt.show()", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4pwsy9t0rm0000gn/T/tmp2x_sml66/9vYvmVRoinflation.csv\")\n\n# Convert the 'Year' column to datetime\ndf['Year'] = pd.to_datetime(df['Year'], format='%Y')\n\n# Group by 'Year' and calculate the average inflation\ndf_avg_inflation = df.groupby('Year')['Inflation'].mean().reset_index()\n\n# Plot the average yearly inflation as a time series\nplt.figure(figsize=(10,6))\nplt.plot(df_avg_inflation['Year'], df_avg_inflation['Inflation'], marker='o')\nplt.title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel('Inflation')\nplt.grid(True)\nplt.show()" - }, - "call_id": "f4efa2d4-e4e7-4ea1-8c5e-6a78bec5816f", - "tool_name": { - "__enum__": "BuiltinTool", - "__module__": "llama_stack.models.llama.datatypes", - "value": "code_interpreter" - } - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "qQY5sAli", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:21.953806+00:00", - "__module__": "datetime" - }, - "trace_id": "8YKzpfybSiGgrHOF", - "type": "metric", - "unit": "tokens", - "value": 432 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "qQY5sAli", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:21.953843+00:00", - "__module__": "datetime" - }, - "trace_id": "8YKzpfybSiGgrHOF", - "type": "metric", - "unit": "tokens", - "value": 10 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "qQY5sAli", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:21.953847+00:00", - "__module__": "datetime" - }, - "trace_id": "8YKzpfybSiGgrHOF", - "type": "metric", - "unit": "tokens", - "value": 442 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics of the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"The error message indicates that the file \\\"\" does not exist. This could be due to a number of reasons such as the file being deleted, the path being incorrect, or the file not being accessible.\\n\\nTo resolve this issue, you can try the following:\\n\\n1. Check the file path: Make sure the file path is correct and the file exists at that location.\\n2. Check file permissions: Ensure that the file is accessible and you have the necessary permissions to read it.\\n3. Try a different file: If the file is not available, try using a different file or creating a sample file to test the code.\\n\\nIf you are still having issues, please provide more details about the file and the environment you are running the code in, and I'll be happy to help further.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Convert the 'Year' column to datetime\\ndf['Year'] = pd.to_datetime(df['Year'], format='%Y')\\n\\n# Group by 'Year' and calculate the average inflation\\ndf_avg_inflation = df.groupby('Year')['Inflation'].mean().reset_index()\\n\\n# Plot the average yearly inflation as a time series\\nplt.figure(figsize=(10,6))\\nplt.plot(df_avg_inflation['Year'], df_avg_inflation['Inflation'], marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Inflation')\\nplt.grid(True)\\nplt.show()\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " error message indicates that the file \"/var/folders/rb/qv8", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "vwgyj6yjd3t4pwsy9t0", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "rm0000gn/T/tmpk42nx919/quNW0bvlin", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "flation.csv\" does not exist. This could be due to a number of", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " reasons such as the file being deleted, the path being incorrect, or the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " file not being accessible.\n\nTo resolve this issue,", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " you can try the following:\n\n1. Check the file path: Make sure", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " the file path is correct and the file exists at that location.\n2.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " Check file permissions: Ensure that the file is accessible and you have the necessary", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " permissions to read it.\n3. Try a different file: If the file", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " is not available, try using a different file or creating a sample file to", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " test the code.\n\nIf you are still having issues, please provide more details", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " about the file and the environment you are running the code in, and I", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "'ll be happy to help further.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "metric": "prompt_tokens", - "unit": null, - "value": 644 - }, - { - "metric": "completion_tokens", - "unit": null, - "value": 208 - }, - { - "metric": "total_tokens", - "unit": null, - "value": 852 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics of the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"The error message indicates that the file \\\"\" does not exist. This could be due to a number of reasons such as the file being deleted, the path being incorrect, or the file not being accessible.\\n\\nTo resolve this issue, you can try the following:\\n\\n1. Check the file path: Make sure the file path is correct and the file exists at that location.\\n2. Check file permissions: Ensure that the file is accessible and you have the necessary permissions to read it.\\n3. Try a different file: If the file is not available, try using a different file or creating a sample file to test the code.\\n\\nIf you are still having issues, please provide more details about the file and the environment you are running the code in, and I'll be happy to help further.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load the CSV", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " file\ndf = pd.read_csv(\"/var/folders/rb/qv", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "8vwgyj6yjd3t4pwsy9t", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "0rm0000gn/T/tmpk42nx919/quNW0bv", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "linflation.csv\")\n\n# Convert the 'Year' column to datetime\ndf", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "['Year'] = pd.to_datetime(df['Year'], format", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "='%Y')\n\n# Group by 'Year' and calculate", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " the average inflation\ndf_avg_inflation = df.groupby('", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "Year')['Inflation'].mean().reset_index()\n\n# Plot the average yearly", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " inflation as a time series\nplt.figure(figsize=(10", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": ",6))\nplt.plot(df_avg_inflation['Year'], df_avg_in", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "flation['Inflation'], marker='o')\nplt.title('Average Yearly", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " Inflation')\nplt.xlabel('Year')\nplt.ylabel('Inflation')\nplt", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": ".grid(True)\nplt.show()", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4pwsy9t0rm0000gn/T/tmpk42nx919/quNW0bvlinflation.csv\")\n\n# Convert the 'Year' column to datetime\ndf['Year'] = pd.to_datetime(df['Year'], format='%Y')\n\n# Group by 'Year' and calculate the average inflation\ndf_avg_inflation = df.groupby('Year')['Inflation'].mean().reset_index()\n\n# Plot the average yearly inflation as a time series\nplt.figure(figsize=(10,6))\nplt.plot(df_avg_inflation['Year'], df_avg_inflation['Inflation'], marker='o')\nplt.title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel('Inflation')\nplt.grid(True)\nplt.show()" - }, - "call_id": "dcc87d01-280c-4aef-9d8a-4295ddaff6c6", - "tool_name": { - "__enum__": "BuiltinTool", - "__module__": "llama_stack.models.llama.datatypes", - "value": "code_interpreter" - } - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "metric": "prompt_tokens", - "unit": null, - "value": 399 - }, - { - "metric": "completion_tokens", - "unit": null, - "value": 10 - }, - { - "metric": "total_tokens", - "unit": null, - "value": 409 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics of the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"The error message indicates that the file \\\"\" does not exist. This could be due to a number of reasons, such as the file being deleted or moved, or the path being incorrect.\\n\\nTo resolve this issue, you can try the following:\\n\\n1. Check the file path: Make sure the file path is correct and the file exists at that location.\\n2. Check file permissions: Ensure that the user running the script has read permissions for the file.\\n3. Use a try-except block: You can use a try-except block to catch the FileNotFoundError and handle it accordingly.\\n\\nHere's an example of how you can modify the code to handle the error:\\n\\n```\\nimport pandas as pd\\n\\ntry:\\n df = pd.read_csv(\\\"\")\\n print(df.head())\\n print(df.info())\\n print(df.describe())\\nexcept FileNotFoundError:\\n print(\\\"The file does not exist\\\")\\n```\\n\\nThis code will print \\\"The file does not exist\\\" if the file is not found, instead of raising an error.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\nimport matplotlib.pyplot as plt\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Convert the 'Year' column to datetime\\ndf['Year'] = pd.to_datetime(df['Year'], format='%Y')\\n\\n# Group by 'Year' and calculate the average inflation\\ndf_avg_inflation = df.groupby('Year')['Inflation'].mean().reset_index()\\n\\n# Plot the average yearly inflation as a time series\\nplt.figure(figsize=(10,6))\\nplt.plot(df_avg_inflation['Year'], df_avg_inflation['Inflation'], marker='o')\\nplt.title('Average Yearly Inflation')\\nplt.xlabel('Year')\\nplt.ylabel('Inflation')\\nplt.grid(True)\\nplt.show()\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " error message indicates that the file \"/var/folders/rb/qv8", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "vwgyj6yjd3t4pwsy9t0", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "rm0000gn/T/tmpaxhbg_k1/YkD5", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "cgQZinflation.csv\" does not exist. This could be due", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " to a number of reasons, such as the file", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " being deleted or moved, or the path being incorrect.\n\nTo resolve this issue", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": ", you can try the following:\n\n1. Check the file path: Make", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " sure the file path is correct and the file exists at that location.\n2", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": ". Check file permissions: Ensure that the user running the script has read permissions", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " for the file.\n3", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": ". Use a try-except block: You can", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " use a try-except block to catch the FileNotFoundError", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " and handle it accordingly.\n\nHere's an example of how you can modify the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " code to handle the error:\n\n```\nimport pandas as pd\nimport matplotlib", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": ".pyplot as plt\n\ntry:\n df = pd.read_csv(\"/var", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "/folders/rb/qv8vwgyj6yjd3t", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "4pwsy9t0rm0000", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "gn/T/tmpaxhbg_k1/YkD5cgQZ", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "inflation.csv\")\n df['Year'] = pd.to_datetime(df['", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "Year'], format='%Y')\n df_avg_inflation = df.groupby('", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "Year')['Inflation'].mean().reset_index()\n plt.figure(figsize=(", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "10,6))\n plt.plot(df_avg_inflation['Year'], df", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "_avg_inflation['Inflation'], marker='o')\n plt.title('", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "Average Yearly Inflation')\n plt.xlabel('Year')\n plt.ylabel", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "('Inflation')\n plt.grid(True)\n plt.show()\nexcept FileNotFoundError", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": ":\n print(\"The file does not exist\")\n```\n\n", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "This code will print \"The file does not exist\" if the file is", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " not found, instead of raising an error.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "metric": "prompt_tokens", - "unit": null, - "value": 745 - }, - { - "metric": "completion_tokens", - "unit": null, - "value": 391 - }, - { - "metric": "total_tokens", - "unit": null, - "value": 1136 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics of the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"The error message indicates that the file \\\"\" does not exist. This could be due to a number of reasons, such as the file being deleted or moved, or the path being incorrect.\\n\\nTo resolve this issue, you can try the following:\\n\\n1. Check the file path: Make sure the file path is correct and the file exists at that location.\\n2. Check file permissions: Ensure that the user running the script has read permissions for the file.\\n3. Use a try-except block: You can use a try-except block to catch the FileNotFoundError and handle it accordingly.\\n\\nHere's an example of how you can modify the code to handle the error:\\n\\n```\\nimport pandas as pd\\n\\ntry:\\n df = pd.read_csv(\\\"\")\\n print(df.head())\\n print(df.info())\\n print(df.describe())\\nexcept FileNotFoundError:\\n print(\\\"The file does not exist\\\")\\n```\\n\\nThis code will print \\\"The file does not exist\\\" if the file is not found, instead of raising an error.\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Plot average yearly inflation as a time series\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load the CSV", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " file\ndf = pd.read_csv(\"/var/folders/rb/qv", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "8vwgyj6yjd3t4", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "pwsy9t0rm0000gn", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "/T/tmpaxhbg_k1/YkD5cgQZin", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "flation.csv\")\n\n# Convert the 'Year' column", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " to datetime\ndf['Year'] = pd.to_datetime(df['Year'],", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " format='%Y')\n\n# Group by", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " 'Year' and calculate the average inflation\ndf_avg_in", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "flation = df.groupby('Year')['Inflation'].mean().reset_index()\n\n", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "# Plot the average yearly inflation as a time series\n", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "plt.figure(figsize=(10,6))\nplt.plot(df_avg_inflation['", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "Year'], df_avg_inflation['Inflation'], marker='o')\nplt", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": ".title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "('Inflation')\nplt.grid(True)\nplt.show()", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4pwsy9t0rm0000gn/T/tmpaxhbg_k1/YkD5cgQZinflation.csv\")\n\n# Convert the 'Year' column to datetime\ndf['Year'] = pd.to_datetime(df['Year'], format='%Y')\n\n# Group by 'Year' and calculate the average inflation\ndf_avg_inflation = df.groupby('Year')['Inflation'].mean().reset_index()\n\n# Plot the average yearly inflation as a time series\nplt.figure(figsize=(10,6))\nplt.plot(df_avg_inflation['Year'], df_avg_inflation['Inflation'], marker='o')\nplt.title('Average Yearly Inflation')\nplt.xlabel('Year')\nplt.ylabel('Inflation')\nplt.grid(True)\nplt.show()" - }, - "call_id": "947831c7-8279-4f5b-8cfd-c30697382127", - "tool_name": { - "__enum__": "BuiltinTool", - "__module__": "llama_stack.models.llama.datatypes", - "value": "code_interpreter" - } - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "metric": "prompt_tokens", - "unit": null, - "value": 496 - }, - { - "metric": "completion_tokens", - "unit": null, - "value": 10 - }, - { - "metric": "total_tokens", - "unit": null, - "value": 506 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics about the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " error message indicates that the file \"/var/folders/rb/qv8vwgyj6y", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "jd3t4pwsy9t0rm0000gn/T/tmp2x_sml", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "66/9vYvmVRoinflation.csv\" does not exist. This could be", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " due to a variety of", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " reasons such as the file", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " being deleted, the path being incorrect, or the file", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " not being accessible.\n\nTo resolve this issue, you can try", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " the following:\n\n1. Check the file path: Ensure that", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " the file path is correct and the file exists at that", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " location.\n2. Check file permissions: Ensure that", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " the file is accessible and", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " you have the necessary permissions to read", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " it.\n3. Try a different file: If", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " the file is not accessible, try loading a different file to see", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " if the issue is specific to this file or a general", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " issue with your code.\n", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "4. Check for typos: Ensure that", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " there are no typos in the file path or the code.\n\n", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "If you are", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " still having issues, please provide more details about", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " the file and the code you are using", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": ", and I'll be happy to help further.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "KwfNrQLy", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:19.630894+00:00", - "__module__": "datetime" - }, - "trace_id": "kNsljyzfQV2Cn4aZ", - "type": "metric", - "unit": "tokens", - "value": 192 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "KwfNrQLy", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:19.630987+00:00", - "__module__": "datetime" - }, - "trace_id": "kNsljyzfQV2Cn4aZ", - "type": "metric", - "unit": "tokens", - "value": 238 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "KwfNrQLy", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:19.630996+00:00", - "__module__": "datetime" - }, - "trace_id": "kNsljyzfQV2Cn4aZ", - "type": "metric", - "unit": "tokens", - "value": 430 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics of the dataframe\\nprint(df.describe())\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"error\\n[stdout]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stdout]\\n[stderr]\\n[Errno 2] No such file or directory: 'bwrap'\\n[/stderr]\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "The", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " error message indicates that the file \"/var/folders/rb/qv8", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "vwgyj6yjd3t4pwsy9t0", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "rm0000gn/T/tmpaxhbg_k1/YkD5", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "cgQZinflation.csv\" does not exist. This could be", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " due to a number of reasons, such as the file being deleted or moved", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": ", or the path being incorrect.\n\nTo resolve this", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " issue, you can try the following:\n\n1. Check the file path:", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " Make sure the file path is correct and the file exists at that", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " location.\n2. Check file permissions", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": ": Ensure that the user running the script has read permissions", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " for the file.\n3. Use a", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " try-except block: You can use a try-except", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " block to catch the FileNotFoundError and handle it accordingly.\n\nHere's an", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " example of how you can modify the code to handle the error", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": ":\n\n```\nimport pandas as pd\n\ntry:\n", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " df = pd.read_csv(\"/var/folders/rb/q", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "v8vwgyj6yjd3t4pwsy9", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "t0rm0000gn/T/tmpaxhbg_k1/Yk", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "D5cgQZinflation.csv\")\n print(df.head())\n print(df.info())\n ", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " print(df.describe())\nexcept FileNotFoundError:\n print(\"The file does not exist", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "\")\n```\n\nThis code will print \"", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "The file does not exist\" if the file is not", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " found, instead of raising an error.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "metric": "prompt_tokens", - "unit": null, - "value": 193 - }, - { - "metric": "completion_tokens", - "unit": null, - "value": 301 - }, - { - "metric": "total_tokens", - "unit": null, - "value": 494 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Here is a csv, can you describe it?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"# User provided a file accessible to you at \\\"\"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "import pandas as pd\n\n# Load the CSV file\ndf = pd.read", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "_csv(\"/var/folders/rb/qv8vwgyj6y", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "jd3t4pwsy9t0rm0000gn/T", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "/tmpaxhbg_k1/YkD5cgQZinflation", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": ".csv\")\n\n# Print the first few rows of the dataframe\nprint(df.head", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "())\n\n# Print information about", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " the dataframe\nprint(df", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": ".info())\n\n# Print summary statistics of the dataframe\nprint(df.describe())", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "code": "import pandas as pd\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4pwsy9t0rm0000gn/T/tmpaxhbg_k1/YkD5cgQZinflation.csv\")\n\n# Print the first few rows of the dataframe\nprint(df.head())\n\n# Print information about the dataframe\nprint(df.info())\n\n# Print summary statistics of the dataframe\nprint(df.describe())" - }, - "call_id": "ae5e788f-1422-4f12-b465-a29043709fbb", - "tool_name": { - "__enum__": "BuiltinTool", - "__module__": "llama_stack.models.llama.datatypes", - "value": "code_interpreter" - } - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "metric": "prompt_tokens", - "unit": null, - "value": 36 - }, - { - "metric": "completion_tokens", - "unit": null, - "value": 10 - }, - { - "metric": "total_tokens", - "unit": null, - "value": 46 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:3e31d\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:14c27\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:90a49\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"using LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:14c27\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:14c27\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:14c27\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:3e31d\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:14c27\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:90a49\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "[k", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "nowledge_search(query=\"using LoRA in Torchtune", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "\")]", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "using LoRA in Torchtune" - }, - "call_id": "0e2bf2f6-0915-4d44-ad3b-6fd48d3df717", - "tool_name": "knowledge_search" - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "metric": "prompt_tokens", - "unit": null, - "value": 107 - }, - { - "metric": "completion_tokens", - "unit": null, - "value": 23 - }, - { - "metric": "total_tokens", - "unit": null, - "value": 130 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:3e31d\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:14c27\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:90a49\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "start" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "I", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "'m ready to help. What's", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " your question about Torchtune?", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "metric": "prompt_tokens", - "unit": null, - "value": 75 - }, - { - "metric": "completion_tokens", - "unit": null, - "value": 25 - }, - { - "metric": "total_tokens", - "unit": null, - "value": 100 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:42933\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:20e5d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0cd43\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"using LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:20e5d\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:20e5d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:20e5d\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \"\\nYou can use code_interpreter to load and inspect it.\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Execute code\", \"parameters\": {\"code\": {\"default\": null, \"description\": \"The code to execute\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"code_interpreter\"}}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " applying LoRA to all linear layers in the self-attention", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -48969,8 +44850,13 @@ "data": { "event": { "delta": { - "text": ", increasing the rank, and scaling alpha", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -48989,8 +44875,13 @@ "data": { "event": { "delta": { - "text": " and rank together.\n\nNote: You need to have the pre-trained Llama2 weights and tokenizer downloaded", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "import pandas as pd\n\n# Load the CSV file\ndf = pd.read", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -49009,8 +44900,13 @@ "data": { "event": { "delta": { - "text": " and installed before running the LoRA finetune. Additionally,", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "_csv(\"/var/folders/rb/qv8vwgyj6y", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -49029,8 +44925,13 @@ "data": { "event": { "delta": { - "text": " you may need to modify the config file to point to", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "jd3t4pwsy9t0rm0000gn/T", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -49049,8 +44950,13 @@ "data": { "event": { "delta": { - "text": " the location of your Llama2 weights and tokenizer.", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "/tmp2x_sml66/9vYvmVRoinflation.csv", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -49069,94 +44975,18 @@ "data": { "event": { "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "prompt_tokens", - "span_id": "4uwx07lA", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:34.698983+00:00", - "__module__": "datetime" - }, - "trace_id": "8C2YTmRESTKZ0i1l", - "type": "metric", - "unit": "tokens", - "value": 146 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "completion_tokens", - "span_id": "4uwx07lA", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:34.699031+00:00", - "__module__": "datetime" - }, - "trace_id": "8C2YTmRESTKZ0i1l", - "type": "metric", - "unit": "tokens", - "value": 296 - }, - { - "attributes": { - "model_id": "meta-llama/Llama-3.3-70B-Instruct", - "provider_id": "fireworks" - }, - "metric": "total_tokens", - "span_id": "4uwx07lA", - "timestamp": { - "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:34.699038+00:00", - "__module__": "datetime" - }, - "trace_id": "8C2YTmRESTKZ0i1l", - "type": "metric", - "unit": "tokens", - "value": 442 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:42933\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:20e5d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0cd43\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { - "chunks": [ - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\")\n\n# Print the first few rows of the dataframe\nprint(df.head())\n\n", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -49170,8 +45000,13 @@ "data": { "event": { "delta": { - "text": "[k", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "# Print information about the dataframe\nprint(df", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -49190,8 +45025,13 @@ "data": { "event": { "delta": { - "text": "nowledge_search(query=\"using LoRA in Torchtune", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".info())\n\n# Print summary statistics about the dataframe\nprint(df.describe", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -49210,8 +45050,13 @@ "data": { "event": { "delta": { - "text": "\")]", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "())", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -49237,10 +45082,14 @@ }, "tool_call": { "arguments": { - "query": "using LoRA in Torchtune" + "code": "import pandas as pd\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/rb/qv8vwgyj6yjd3t4pwsy9t0rm0000gn/T/tmp2x_sml66/9vYvmVRoinflation.csv\")\n\n# Print the first few rows of the dataframe\nprint(df.head())\n\n# Print information about the dataframe\nprint(df.info())\n\n# Print summary statistics about the dataframe\nprint(df.describe())" }, - "call_id": "ce4b06be-6e7f-45cf-9555-25398caaf4f1", - "tool_name": "knowledge_search" + "call_id": "5bbfebeb-4360-4ef9-a9e2-4227a8e8c699", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "code_interpreter" + } }, "type": "tool_call" }, @@ -49287,16 +45136,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "vGtNmXNY", + "span_id": "AyEX3So6", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:32.673350+00:00", + "__datetime__": "2025-03-07T01:44:17.873486+00:00", "__module__": "datetime" }, - "trace_id": "8C2YTmRESTKZ0i1l", + "trace_id": "kNsljyzfQV2Cn4aZ", "type": "metric", "unit": "tokens", - "value": 107 + "value": 36 }, { "attributes": { @@ -49304,16 +45153,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "vGtNmXNY", + "span_id": "AyEX3So6", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:32.673375+00:00", + "__datetime__": "2025-03-07T01:44:17.873500+00:00", "__module__": "datetime" }, - "trace_id": "8C2YTmRESTKZ0i1l", + "trace_id": "kNsljyzfQV2Cn4aZ", "type": "metric", "unit": "tokens", - "value": 23 + "value": 10 }, { "attributes": { @@ -49321,16 +45170,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "vGtNmXNY", + "span_id": "AyEX3So6", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:32.673381+00:00", + "__datetime__": "2025-03-07T01:44:17.873503+00:00", "__module__": "datetime" }, - "trace_id": "8C2YTmRESTKZ0i1l", + "trace_id": "kNsljyzfQV2Cn4aZ", "type": "metric", "unit": "tokens", - "value": 130 + "value": 46 } ] } @@ -49338,7 +45187,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:42933\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:20e5d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0cd43\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:42933\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:20e5d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0cd43\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"using LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:20e5d\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:20e5d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:20e5d\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:57768\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:4fd22\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:6e83e\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"using LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:4fd22\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:4fd22\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:4fd22\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:42933\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:20e5d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0cd43\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " using torchtune's Lo", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -49887,7 +45796,7 @@ "data": { "event": { "delta": { - "text": "RA recipe: `tune run --", + "text": "[k", "type": "text" }, "event_type": { @@ -49907,7 +45816,7 @@ "data": { "event": { "delta": { - "text": "nnodes 1 --nproc_per_node 2 lora_finet", + "text": "nowledge_search(query=\"using LoRA in Torchtune", "type": "text" }, "event_type": { @@ -49927,7 +45836,7 @@ "data": { "event": { "delta": { - "text": "une_distributed --config llama2/7B_lora`\n\nYou can", + "text": "\")]", "type": "text" }, "event_type": { @@ -49947,8 +45856,19 @@ "data": { "event": { "delta": { - "text": " also experiment with different LoRA configurations, such as applying Lo", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "using LoRA in Torchtune" + }, + "call_id": "ce4b06be-6e7f-45cf-9555-25398caaf4f1", + "tool_name": "knowledge_search" + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -49956,7 +45876,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -49967,53 +45891,94 @@ "data": { "event": { "delta": { - "text": "RA to all linear layers in the self-attention, increasing", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " the rank, and scaling alpha", - "type": "text" + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "vGtNmXNY", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:32.673350+00:00", + "__module__": "datetime" + }, + "trace_id": "8C2YTmRESTKZ0i1l", + "type": "metric", + "unit": "tokens", + "value": 107 }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "vGtNmXNY", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:32.673375+00:00", + "__module__": "datetime" + }, + "trace_id": "8C2YTmRESTKZ0i1l", + "type": "metric", + "unit": "tokens", + "value": 23 }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "vGtNmXNY", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:32.673381+00:00", + "__module__": "datetime" + }, + "trace_id": "8C2YTmRESTKZ0i1l", + "type": "metric", + "unit": "tokens", + "value": 130 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:42933\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:20e5d\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0cd43\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " and rank together.\n\nNote: You need to have the Llama2 weights", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -50027,7 +45992,7 @@ "data": { "event": { "delta": { - "text": " and tokenizer downloaded and installed, and you need to have the", + "text": "I", "type": "text" }, "event_type": { @@ -50047,7 +46012,7 @@ "data": { "event": { "delta": { - "text": " necessary dependencies installed, including torchtune", + "text": "'m ready to help. What's", "type": "text" }, "event_type": { @@ -50067,7 +46032,7 @@ "data": { "event": { "delta": { - "text": " and W&B.", + "text": " your question about Torchtune?", "type": "text" }, "event_type": { @@ -50104,19 +46069,55 @@ }, "metrics": [ { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "prompt_tokens", - "unit": null, - "value": 146 + "span_id": "7n3WMt3R", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:31.179269+00:00", + "__module__": "datetime" + }, + "trace_id": "BLgI_VzNTCCRs_2T", + "type": "metric", + "unit": "tokens", + "value": 75 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "completion_tokens", - "unit": null, - "value": 303 + "span_id": "7n3WMt3R", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:31.179301+00:00", + "__module__": "datetime" + }, + "trace_id": "BLgI_VzNTCCRs_2T", + "type": "metric", + "unit": "tokens", + "value": 25 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "total_tokens", - "unit": null, - "value": 449 + "span_id": "7n3WMt3R", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:31.179308+00:00", + "__module__": "datetime" + }, + "trace_id": "BLgI_VzNTCCRs_2T", + "type": "metric", + "unit": "tokens", + "value": 100 } ] } @@ -50124,7 +46125,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:57768\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:4fd22\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:6e83e\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:8106c\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:a03f3\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0719d\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your first question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"using LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:a03f3\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:a03f3\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:a03f3\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:57768\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:4fd22\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:6e83e\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", + "text": "_model = lora_llama2_7b(lora_attn_modules", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -50312,7 +46293,7 @@ "data": { "event": { "delta": { - "text": "I", + "text": "=[\"q_proj\", \"v_proj\"])`\n3. Load", "type": "text" }, "event_type": { @@ -50332,7 +46313,7 @@ "data": { "event": { "delta": { - "text": "'m ready to help. What's", + "text": " the pre-trained Llama2 weights into", "type": "text" }, "event_type": { @@ -50352,7 +46333,7 @@ "data": { "event": { "delta": { - "text": " your question about Torchtune?", + "text": " the LoRA model: `lora_model.load_state", "type": "text" }, "event_type": { @@ -50372,58 +46353,33 @@ "data": { "event": { "delta": { - "text": "", + "text": "_dict(base_model.state_dict(), strict=False)`\n4. Set only Lo", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, - "metrics": [ - { - "metric": "prompt_tokens", - "unit": null, - "value": 75 - }, - { - "metric": "completion_tokens", - "unit": null, - "value": 25 - }, - { - "metric": "total_tokens", - "unit": null, - "value": 100 - } - ] + "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:8106c\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:a03f3\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0719d\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your first question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"using LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:a03f3\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:a03f3\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:a03f3\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:8106c\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:a03f3\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0719d\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your first question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " recipe: `tune run --nnodes 1 --", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -50757,7 +46774,7 @@ "data": { "event": { "delta": { - "text": "nproc_per_node 2 lora_finetune_distributed --config", + "text": "[k", "type": "text" }, "event_type": { @@ -50777,7 +46794,7 @@ "data": { "event": { "delta": { - "text": " llama2/7B_lora`\n\nYou can also experiment with different Lo", + "text": "nowledge_search(query=\"using LoRA in Torchtune", "type": "text" }, "event_type": { @@ -50797,7 +46814,7 @@ "data": { "event": { "delta": { - "text": "RA configurations, such as applying LoRA to all linear layers", + "text": "\")]", "type": "text" }, "event_type": { @@ -50817,8 +46834,19 @@ "data": { "event": { "delta": { - "text": " in the self-attention, increasing the rank, and", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "using LoRA in Torchtune" + }, + "call_id": "d45a488f-368a-4a3b-a2d9-8fde584fc8f8", + "tool_name": "knowledge_search" + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -50826,7 +46854,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -50837,33 +46869,94 @@ "data": { "event": { "delta": { - "text": " scaling alpha and rank together.\n\nNote: You need to", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "complete" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, - "metrics": null + "metrics": [ + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "prompt_tokens", + "span_id": "qLPBZlok", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:26.209198+00:00", + "__module__": "datetime" + }, + "trace_id": "7GQeegpgTI-gqjHp", + "type": "metric", + "unit": "tokens", + "value": 108 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "completion_tokens", + "span_id": "qLPBZlok", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:26.209239+00:00", + "__module__": "datetime" + }, + "trace_id": "7GQeegpgTI-gqjHp", + "type": "metric", + "unit": "tokens", + "value": 23 + }, + { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, + "metric": "total_tokens", + "span_id": "qLPBZlok", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:26.209247+00:00", + "__module__": "datetime" + }, + "trace_id": "7GQeegpgTI-gqjHp", + "type": "metric", + "unit": "tokens", + "value": 131 + } + ] } - }, + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:8106c\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:a03f3\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0719d\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": " have the Llama2 weights and tokenizer downloaded and installed before running the", + "text": "", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "progress" + "value": "start" }, "logprobs": null, "stop_reason": null @@ -50877,7 +46970,7 @@ "data": { "event": { "delta": { - "text": " LoRA finetune. Additionally, you can use", + "text": "I", "type": "text" }, "event_type": { @@ -50897,7 +46990,7 @@ "data": { "event": { "delta": { - "text": " torchtune's `Wand", + "text": "'m ready to help. What's", "type": "text" }, "event_type": { @@ -50917,7 +47010,7 @@ "data": { "event": { "delta": { - "text": "BLogger` to generate loss curves and track your experiments", + "text": " your first question about Torchtune", "type": "text" }, "event_type": { @@ -50937,7 +47030,7 @@ "data": { "event": { "delta": { - "text": ".", + "text": "?", "type": "text" }, "event_type": { @@ -50979,16 +47072,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "IZ8Q_jX_", + "span_id": "mYTkxvK_", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:28.484818+00:00", + "__datetime__": "2025-03-07T01:45:23.525734+00:00", "__module__": "datetime" }, - "trace_id": "7GQeegpgTI-gqjHp", + "trace_id": "kpcdkZQ2SsSOh9Lw", "type": "metric", "unit": "tokens", - "value": 147 + "value": 75 }, { "attributes": { @@ -50996,16 +47089,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "IZ8Q_jX_", + "span_id": "mYTkxvK_", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:28.484914+00:00", + "__datetime__": "2025-03-07T01:45:23.525763+00:00", "__module__": "datetime" }, - "trace_id": "7GQeegpgTI-gqjHp", + "trace_id": "kpcdkZQ2SsSOh9Lw", "type": "metric", "unit": "tokens", - "value": 290 + "value": 26 }, { "attributes": { @@ -51013,16 +47106,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "IZ8Q_jX_", + "span_id": "mYTkxvK_", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:28.484922+00:00", + "__datetime__": "2025-03-07T01:45:23.525770+00:00", "__module__": "datetime" }, - "trace_id": "7GQeegpgTI-gqjHp", + "trace_id": "kpcdkZQ2SsSOh9Lw", "type": "metric", "unit": "tokens", - "value": 437 + "value": 101 } ] } @@ -51030,7 +47123,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:8106c\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:a03f3\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0719d\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help. What's your first question about Torchtune?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -51078,7 +47171,7 @@ "data": { "event": { "delta": { - "text": "nowledge_search(query=\"using LoRA in Torchtune", + "text": "nowledge_search(query=\"Tor", "type": "text" }, "event_type": { @@ -51098,7 +47191,7 @@ "data": { "event": { "delta": { - "text": "\")]", + "text": "chtune documentation\")]", "type": "text" }, "event_type": { @@ -51125,9 +47218,9 @@ }, "tool_call": { "arguments": { - "query": "using LoRA in Torchtune" + "query": "Torchtune documentation" }, - "call_id": "d45a488f-368a-4a3b-a2d9-8fde584fc8f8", + "call_id": "385cbde8-19e8-4c8b-84ca-b75050b3666b", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -51175,16 +47268,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "qLPBZlok", + "span_id": "-7YS2sLl", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:26.209198+00:00", + "__datetime__": "2025-03-07T01:45:30.668846+00:00", "__module__": "datetime" }, - "trace_id": "7GQeegpgTI-gqjHp", + "trace_id": "BLgI_VzNTCCRs_2T", "type": "metric", "unit": "tokens", - "value": 108 + "value": 39 }, { "attributes": { @@ -51192,16 +47285,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "qLPBZlok", + "span_id": "-7YS2sLl", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:26.209239+00:00", + "__datetime__": "2025-03-07T01:45:30.668859+00:00", "__module__": "datetime" }, - "trace_id": "7GQeegpgTI-gqjHp", + "trace_id": "BLgI_VzNTCCRs_2T", "type": "metric", "unit": "tokens", - "value": 23 + "value": 20 }, { "attributes": { @@ -51209,16 +47302,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "qLPBZlok", + "span_id": "-7YS2sLl", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:26.209247+00:00", + "__datetime__": "2025-03-07T01:45:30.668861+00:00", "__module__": "datetime" }, - "trace_id": "7GQeegpgTI-gqjHp", + "trace_id": "BLgI_VzNTCCRs_2T", "type": "metric", "unit": "tokens", - "value": 131 + "value": 59 } ] } @@ -51226,7 +47319,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:8106c\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model\\n\\n dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:a03f3\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:0719d\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Llama3-8B attention type\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:num-1\\nContent: 3 `_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\\nof models across a `range of different benchmarks `_.\\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. In this tutorial we will focus on the 8B size model.\\nThere are a few main changes between Llama2-7B and Llama3-8B models:\\n\\n- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B\\n- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:num-1\\nContent: instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_\\n\\n|\\n\\nGetting access to Llama3-8B-Instruct\\n------------------------------------\\n\\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. First, let's download the model from Hugging Face. You will need to follow the instructions\\non the `official Meta page `_ to gain access to the model.\\nNext, make sure you grab your Hugging Face token from `here `_.\\n\\n\\n.. code-block:: bash\\n\\n tune download meta-llama/Meta-Llama-3\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:num-0\\nContent: :`download Llama3 Instruct weights `\\n\\n\\nTemplate changes from Llama2 to Llama3\\n--------------------------------------\\n\\nThe Llama2 chat model requires a specific template when prompting the pre-trained\\nmodel. Since the chat model was pretrained with this prompt template, if you want to run\\ninference on the model, you'll need to use the same template for optimal performance\\non chat data. Otherwise, the model will just perform standard text completion, which\\nmay or may not align with your intended use case.\\n\\nFrom the `official Llama2 prompt\\ntemplate guide `_\\nfor the Llama2 chat model, we can see that special tags are added:\\n\\n.. code-block:: text\\n\\n [INST] <>\\n You are a helpful, respectful, and honest assistant.\\n <>\\n\\n Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant \\n\\nLlama3 Instruct `overhauled `\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:num-0\\nContent: 'm Meta AI, your friendly AI assistant<|eot_id|>\\n\\nThe tags are entirely different, and they are actually encoded differently than in\\nLlama2. Let's walk through tokenizing an example with the Llama2 template and the\\nLlama3 template to understand how.\\n\\n.. note::\\n The Llama3 Base model uses a `different prompt template\\n `_ than Llama3 Instruct\\n because it has not yet been instruct tuned and the extra special tokens are untrained. If you\\n are running inference on the Llama3 Base model without fine-tuning we recommend the base\\n template for optimal performance. Generally, for instruct and chat data, we recommend using\\n Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\\n Llama3 Instruct.\\n\\n.. _prompt_template_vs_special_tokens:\\n\\nTokenizing prompt templates & special tokens\\n--------------------------------------------\\n\\nLet's say I have a sample of a single user-assistant turn accompanied with a system\\nprompt:\\n\\n.. code-block:: python\\n\\n sample = [\\n {\\n \\\"role\\\": \\\"system\\\",\\n \\\"\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:num-3\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\", and \\\"output_proj\\\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\\\"q_proj\\\", \\\"v_proj\\\"])\\n\\n.. note::\\n\\n Calling :func:`lora_llama_2\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Insert documents into memory\", \"parameters\": {}, \"tool_name\": \"insert_into_memory\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -51254,7 +47347,7 @@ "data": { "event": { "delta": { - "text": "I", + "text": "L", "type": "text" }, "event_type": { @@ -51274,7 +47367,7 @@ "data": { "event": { "delta": { - "text": "'m ready to help. What's", + "text": "lama3-8B uses grouped-query", "type": "text" }, "event_type": { @@ -51294,7 +47387,7 @@ "data": { "event": { "delta": { - "text": " your first question about Torchtune", + "text": " attention instead of", "type": "text" }, "event_type": { @@ -51314,7 +47407,7 @@ "data": { "event": { "delta": { - "text": "?", + "text": " the standard multi-head attention.", "type": "text" }, "event_type": { @@ -51356,16 +47449,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "mYTkxvK_", + "span_id": "1eIEdjPP", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:23.525734+00:00", + "__datetime__": "2025-03-07T01:45:18.982970+00:00", "__module__": "datetime" }, - "trace_id": "kpcdkZQ2SsSOh9Lw", + "trace_id": "rNeuYcnxTSqrP6Dg", "type": "metric", "unit": "tokens", - "value": 75 + "value": 80 }, { "attributes": { @@ -51373,16 +47466,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "mYTkxvK_", + "span_id": "1eIEdjPP", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:23.525763+00:00", + "__datetime__": "2025-03-07T01:45:18.983000+00:00", "__module__": "datetime" }, - "trace_id": "kpcdkZQ2SsSOh9Lw", + "trace_id": "rNeuYcnxTSqrP6Dg", "type": "metric", "unit": "tokens", - "value": 26 + "value": 28 }, { "attributes": { @@ -51390,16 +47483,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "mYTkxvK_", + "span_id": "1eIEdjPP", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:45:23.525770+00:00", + "__datetime__": "2025-03-07T01:45:18.983005+00:00", "__module__": "datetime" }, - "trace_id": "kpcdkZQ2SsSOh9Lw", + "trace_id": "rNeuYcnxTSqrP6Dg", "type": "metric", "unit": "tokens", - "value": 101 + "value": 108 } ] } @@ -51407,7 +47500,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Llama3-8B attention type\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:num-1\\nContent: 3 `_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\\nof models across a `range of different benchmarks `_.\\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. In this tutorial we will focus on the 8B size model.\\nThere are a few main changes between Llama2-7B and Llama3-8B models:\\n\\n- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B\\n- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:num-1\\nContent: instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_\\n\\n|\\n\\nGetting access to Llama3-8B-Instruct\\n------------------------------------\\n\\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. First, let's download the model from Hugging Face. You will need to follow the instructions\\non the `official Meta page `_ to gain access to the model.\\nNext, make sure you grab your Hugging Face token from `here `_.\\n\\n\\n.. code-block:: bash\\n\\n tune download meta-llama/Meta-Llama-3\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:num-0\\nContent: :`download Llama3 Instruct weights `\\n\\n\\nTemplate changes from Llama2 to Llama3\\n--------------------------------------\\n\\nThe Llama2 chat model requires a specific template when prompting the pre-trained\\nmodel. Since the chat model was pretrained with this prompt template, if you want to run\\ninference on the model, you'll need to use the same template for optimal performance\\non chat data. Otherwise, the model will just perform standard text completion, which\\nmay or may not align with your intended use case.\\n\\nFrom the `official Llama2 prompt\\ntemplate guide `_\\nfor the Llama2 chat model, we can see that special tags are added:\\n\\n.. code-block:: text\\n\\n [INST] <>\\n You are a helpful, respectful, and honest assistant.\\n <>\\n\\n Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant \\n\\nLlama3 Instruct `overhauled `\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:num-0\\nContent: 'm Meta AI, your friendly AI assistant<|eot_id|>\\n\\nThe tags are entirely different, and they are actually encoded differently than in\\nLlama2. Let's walk through tokenizing an example with the Llama2 template and the\\nLlama3 template to understand how.\\n\\n.. note::\\n The Llama3 Base model uses a `different prompt template\\n `_ than Llama3 Instruct\\n because it has not yet been instruct tuned and the extra special tokens are untrained. If you\\n are running inference on the Llama3 Base model without fine-tuning we recommend the base\\n template for optimal performance. Generally, for instruct and chat data, we recommend using\\n Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\\n Llama3 Instruct.\\n\\n.. _prompt_template_vs_special_tokens:\\n\\nTokenizing prompt templates & special tokens\\n--------------------------------------------\\n\\nLet's say I have a sample of a single user-assistant turn accompanied with a system\\nprompt:\\n\\n.. code-block:: python\\n\\n sample = [\\n {\\n \\\"role\\\": \\\"system\\\",\\n \\\"\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:num-3\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\", and \\\"output_proj\\\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\\\"q_proj\\\", \\\"v_proj\\\"])\\n\\n.. note::\\n\\n Calling :func:`lora_llama_2\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -51435,7 +47528,7 @@ "data": { "event": { "delta": { - "text": "[k", + "text": "L", "type": "text" }, "event_type": { @@ -51455,7 +47548,7 @@ "data": { "event": { "delta": { - "text": "nowledge", + "text": "lama3-8B uses grouped-query attention instead of", "type": "text" }, "event_type": { @@ -51475,7 +47568,7 @@ "data": { "event": { "delta": { - "text": "_search(query=\"Torchtune documentation\")]", + "text": " the standard", "type": "text" }, "event_type": { @@ -51495,19 +47588,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "Torchtune documentation" - }, - "call_id": "2ecaa92d-e752-498b-86d0-f6ee5c8b3131", - "tool_name": "knowledge_search" - }, - "type": "tool_call" + "text": " multi-head attention.", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -51515,11 +47597,7 @@ "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, "metrics": null } @@ -51547,19 +47625,55 @@ }, "metrics": [ { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "prompt_tokens", - "unit": null, - "value": 39 + "span_id": "SlTnlfYc", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:12.884663+00:00", + "__module__": "datetime" + }, + "trace_id": "liTx9auyTkyfvrBr", + "type": "metric", + "unit": "tokens", + "value": 80 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "completion_tokens", - "unit": null, - "value": 20 + "span_id": "SlTnlfYc", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:12.884753+00:00", + "__module__": "datetime" + }, + "trace_id": "liTx9auyTkyfvrBr", + "type": "metric", + "unit": "tokens", + "value": 28 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "total_tokens", - "unit": null, - "value": 59 + "span_id": "SlTnlfYc", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:12.884760+00:00", + "__module__": "datetime" + }, + "trace_id": "liTx9auyTkyfvrBr", + "type": "metric", + "unit": "tokens", + "value": 108 } ] } @@ -51567,7 +47681,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Llama3-8B attention type\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:num-1\\nContent: 3 `_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\\nof models across a `range of different benchmarks `_.\\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. In this tutorial we will focus on the 8B size model.\\nThere are a few main changes between Llama2-7B and Llama3-8B models:\\n\\n- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B\\n- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:num-1\\nContent: instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_\\n\\n|\\n\\nGetting access to Llama3-8B-Instruct\\n------------------------------------\\n\\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. First, let's download the model from Hugging Face. You will need to follow the instructions\\non the `official Meta page `_ to gain access to the model.\\nNext, make sure you grab your Hugging Face token from `here `_.\\n\\n\\n.. code-block:: bash\\n\\n tune download meta-llama/Meta-Llama-3\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:num-0\\nContent: :`download Llama3 Instruct weights `\\n\\n\\nTemplate changes from Llama2 to Llama3\\n--------------------------------------\\n\\nThe Llama2 chat model requires a specific template when prompting the pre-trained\\nmodel. Since the chat model was pretrained with this prompt template, if you want to run\\ninference on the model, you'll need to use the same template for optimal performance\\non chat data. Otherwise, the model will just perform standard text completion, which\\nmay or may not align with your intended use case.\\n\\nFrom the `official Llama2 prompt\\ntemplate guide `_\\nfor the Llama2 chat model, we can see that special tags are added:\\n\\n.. code-block:: text\\n\\n [INST] <>\\n You are a helpful, respectful, and honest assistant.\\n <>\\n\\n Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant \\n\\nLlama3 Instruct `overhauled `\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:num-0\\nContent: 'm Meta AI, your friendly AI assistant<|eot_id|>\\n\\nThe tags are entirely different, and they are actually encoded differently than in\\nLlama2. Let's walk through tokenizing an example with the Llama2 template and the\\nLlama3 template to understand how.\\n\\n.. note::\\n The Llama3 Base model uses a `different prompt template\\n `_ than Llama3 Instruct\\n because it has not yet been instruct tuned and the extra special tokens are untrained. If you\\n are running inference on the Llama3 Base model without fine-tuning we recommend the base\\n template for optimal performance. Generally, for instruct and chat data, we recommend using\\n Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\\n Llama3 Instruct.\\n\\n.. _prompt_template_vs_special_tokens:\\n\\nTokenizing prompt templates & special tokens\\n--------------------------------------------\\n\\nLet's say I have a sample of a single user-assistant turn accompanied with a system\\nprompt:\\n\\n.. code-block:: python\\n\\n sample = [\\n {\\n \\\"role\\\": \\\"system\\\",\\n \\\"\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:num-3\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\", and \\\"output_proj\\\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\\\"q_proj\\\", \\\"v_proj\\\"])\\n\\n.. note::\\n\\n Calling :func:`lora_llama_2\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Insert documents into memory\", \"parameters\": {}, \"tool_name\": \"insert_into_memory\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Insert documents into memory\", \"parameters\": {}, \"tool_name\": \"insert_into_memory\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -51587,126 +47701,21 @@ "stop_reason": null }, "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "L", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "lama3-8B uses grouped-query", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " attention instead of the standard multi-head attention.", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "metric": "prompt_tokens", - "unit": null, - "value": 80 - }, - { - "metric": "completion_tokens", - "unit": null, - "value": 28 - }, - { - "metric": "total_tokens", - "unit": null, - "value": 108 - } - ] - } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Llama3-8B attention type\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:num-1\\nContent: 3 `_ is a new family of models released by Meta AI that improves upon the performance of the Llama2 family\\nof models across a `range of different benchmarks `_.\\nCurrently there are two different sizes of Meta Llama 3: 8B and 70B. In this tutorial we will focus on the 8B size model.\\nThere are a few main changes between Llama2-7B and Llama3-8B models:\\n\\n- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B\\n- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-\\n\", \"type\": \"text\"}, {\"text\": \"Result 2:\\nDocument_id:num-1\\nContent: instead of 32,000 from Llama2 models)\\n- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_)\\n- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B\\n- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_\\n\\n|\\n\\nGetting access to Llama3-8B-Instruct\\n------------------------------------\\n\\nFor this tutorial, we will be using the instruction-tuned version of Llama3-8B. First, let's download the model from Hugging Face. You will need to follow the instructions\\non the `official Meta page `_ to gain access to the model.\\nNext, make sure you grab your Hugging Face token from `here `_.\\n\\n\\n.. code-block:: bash\\n\\n tune download meta-llama/Meta-Llama-3\\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:num-0\\nContent: :`download Llama3 Instruct weights `\\n\\n\\nTemplate changes from Llama2 to Llama3\\n--------------------------------------\\n\\nThe Llama2 chat model requires a specific template when prompting the pre-trained\\nmodel. Since the chat model was pretrained with this prompt template, if you want to run\\ninference on the model, you'll need to use the same template for optimal performance\\non chat data. Otherwise, the model will just perform standard text completion, which\\nmay or may not align with your intended use case.\\n\\nFrom the `official Llama2 prompt\\ntemplate guide `_\\nfor the Llama2 chat model, we can see that special tags are added:\\n\\n.. code-block:: text\\n\\n [INST] <>\\n You are a helpful, respectful, and honest assistant.\\n <>\\n\\n Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant \\n\\nLlama3 Instruct `overhauled `\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:num-0\\nContent: 'm Meta AI, your friendly AI assistant<|eot_id|>\\n\\nThe tags are entirely different, and they are actually encoded differently than in\\nLlama2. Let's walk through tokenizing an example with the Llama2 template and the\\nLlama3 template to understand how.\\n\\n.. note::\\n The Llama3 Base model uses a `different prompt template\\n `_ than Llama3 Instruct\\n because it has not yet been instruct tuned and the extra special tokens are untrained. If you\\n are running inference on the Llama3 Base model without fine-tuning we recommend the base\\n template for optimal performance. Generally, for instruct and chat data, we recommend using\\n Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using\\n Llama3 Instruct.\\n\\n.. _prompt_template_vs_special_tokens:\\n\\nTokenizing prompt templates & special tokens\\n--------------------------------------------\\n\\nLet's say I have a sample of a single user-assistant turn accompanied with a system\\nprompt:\\n\\n.. code-block:: python\\n\\n sample = [\\n {\\n \\\"role\\\": \\\"system\\\",\\n \\\"\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:num-3\\nContent: LoRA to Llama2 models\\n------------------------------\\n\\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\\n\\n.. code-block:: python\\n\\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\\n\\n # Build Llama2 without any LoRA layers\\n base_model = llama2_7b()\\n\\n # The default settings for lora_llama2_7b will match those for llama2_7b\\n # We just need to define which layers we want LoRA applied to.\\n # Within each self-attention, we can choose from [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\", and \\\"output_proj\\\"].\\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\\n # layers outside of the self-attention.\\n lora_model = lora_llama2_7b(lora_attn_modules=[\\\"q_proj\\\", \\\"v_proj\\\"])\\n\\n.. note::\\n\\n Calling :func:`lora_llama_2\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { - "chunks": [ + } + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", + "text": "[k", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -51720,7 +47729,7 @@ "data": { "event": { "delta": { - "text": "L", + "text": "nowledge_search(query=\"Llama3-8", "type": "text" }, "event_type": { @@ -51740,7 +47749,7 @@ "data": { "event": { "delta": { - "text": "lama3-8B uses grouped-query", + "text": "B attention type\")]", "type": "text" }, "event_type": { @@ -51760,8 +47769,19 @@ "data": { "event": { "delta": { - "text": " attention instead of the standard multi-head attention.", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "Llama3-8B attention type" + }, + "call_id": "4901bbdf-8faf-4a57-b6f6-01688c6290e6", + "tool_name": "knowledge_search" + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -51769,7 +47789,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -51797,19 +47821,55 @@ }, "metrics": [ { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "prompt_tokens", - "unit": null, - "value": 80 + "span_id": "DBPomV08", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:15.412559+00:00", + "__module__": "datetime" + }, + "trace_id": "rNeuYcnxTSqrP6Dg", + "type": "metric", + "unit": "tokens", + "value": 40 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "completion_tokens", - "unit": null, - "value": 28 + "span_id": "DBPomV08", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:15.412607+00:00", + "__module__": "datetime" + }, + "trace_id": "rNeuYcnxTSqrP6Dg", + "type": "metric", + "unit": "tokens", + "value": 24 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "total_tokens", - "unit": null, - "value": 108 + "span_id": "DBPomV08", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:15.412615+00:00", + "__module__": "datetime" + }, + "trace_id": "rNeuYcnxTSqrP6Dg", + "type": "metric", + "unit": "tokens", + "value": 64 } ] } @@ -51817,7 +47877,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Insert documents into memory\", \"parameters\": {}, \"tool_name\": \"insert_into_memory\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -51865,7 +47925,7 @@ "data": { "event": { "delta": { - "text": "nowledge_search(query=\"Llama3-8", + "text": "nowledge_search(query=\"Llama3-8B attention", "type": "text" }, "event_type": { @@ -51885,7 +47945,7 @@ "data": { "event": { "delta": { - "text": "B attention type\")]", + "text": " type\")]", "type": "text" }, "event_type": { @@ -51914,7 +47974,7 @@ "arguments": { "query": "Llama3-8B attention type" }, - "call_id": "13953b92-bce0-463c-90ce-b2d9cca61e64", + "call_id": "dd056386-b105-47e5-bd85-07e5ae096de1", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -51957,18 +48017,54 @@ }, "metrics": [ { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "prompt_tokens", - "unit": null, + "span_id": "yjKrmpeo", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:12.041566+00:00", + "__module__": "datetime" + }, + "trace_id": "liTx9auyTkyfvrBr", + "type": "metric", + "unit": "tokens", "value": 40 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "completion_tokens", - "unit": null, + "span_id": "yjKrmpeo", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:12.041591+00:00", + "__module__": "datetime" + }, + "trace_id": "liTx9auyTkyfvrBr", + "type": "metric", + "unit": "tokens", "value": 24 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "total_tokens", - "unit": null, + "span_id": "yjKrmpeo", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:12.041597+00:00", + "__module__": "datetime" + }, + "trace_id": "liTx9auyTkyfvrBr", + "type": "metric", + "unit": "tokens", "value": 64 } ] @@ -51977,7 +48073,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Instead of the standard multi-head attention, what attention type does Llama3-8B use?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the current CEO of Meta is.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"current CEO of Meta\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"{\\\"query\\\": \\\"current CEO of Meta\\\", \\\"top_k\\\": [{\\\"title\\\": \\\"Meta - Leadership & Governance\\\", \\\"url\\\": \\\"https://investor.atmeta.com/leadership-and-governance/\\\", \\\"content\\\": \\\"Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. Mark is responsible for setting the overall direction and product strategy for the company. He leads the design of Meta's services and development of its core technology and infrastructure. Mark studied computer science at Harvard\\\", \\\"score\\\": 0.8342047, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Executives - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer Joel Kaplan, Chief Global Affairs Officer Susan Li, Chief Financial Officer Javier Olivan, Chief Operating Officer Chris Cox, Chief Product Officer Andrew \\\\u2018Boz\\\\u2019 Bosworth, Chief Technology Officer Jennifer Newstead, Chief Legal Officer Dave Wehner, Chief Strategy Officer Will Cathcart, Head of WhatsApp Naomi Gleit, Head of Product John Hegeman, Chief Revenue Officer Adam Mosseri, Head of Instagram Erin Egan, Chief Privacy Officer, Policy Michel Protti, Chief Privacy Officer, Product Alex Schultz, Chief Marketing Officer and VP of Analytics Tom Alison, Head of Facebook Nicola Mendelsohn, Head of Global Business Group Ahmad Al-Dahle, VP and Head of GenAI at Meta Joelle Pineau, Vice President of AI Research and Head of FAIR at Meta\\\", \\\"score\\\": 0.8190992, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\\\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\\\u00a9 2025 Meta\\\", \\\"score\\\": 0.79099923, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Meet the Executive CSuite Team of Meta (Facebook) [2025]\\\", \\\"url\\\": \\\"https://digitaldefynd.com/IQ/meet-the-executive-csuite-team-of-meta-facebook/\\\", \\\"content\\\": \\\"Harvard University Executive Programs Free Harvard University Courses As a chief financial officer of Meta, Susan Li oversees the firm\\\\u2019s finance and facilities team to keep track of the company\\\\u2019s overall financial health. The chief operating officer of Meta, Javier Olivan, oversees the firm\\\\u2019s business team, infrastructure, and other products. Andrew Bosworth, called Boz, serves as chief technology officer at Meta and is responsible for leading the firm\\\\u2019s AR/VR organization, Reality Labs. Andrew has also served as engineering director to oversee events, mobile monetization, and feed ads and as VP of ads and business platforms to lead engineering, design, analytics, and product teams. Meta\\\\u2019s c-suite team comprises experienced and diverse executives, having extensive experience in technology, finance, legal, and all major industries.\\\", \\\"score\\\": 0.7602419, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg - Wikipedia\\\", \\\"url\\\": \\\"https://en.wikipedia.org/wiki/Mark_Zuckerberg\\\", \\\"content\\\": \\\"They began dating in 2003.[175] In September 2010, Chan, who was a medical student at the University of California, San Francisco at the time,[176] moved into his rented house in Palo Alto, California.[177][178] They married on May 19, 2012, in the grounds of his mansion in an event that also celebrated her graduation from medical school.[179][180] Zuckerberg revealed in July 2015 that they were expecting a baby girl and that Chan had previously experienced three miscarriages.[181] Their first daughter was born in December 2015.[182] They announced in a Chinese New Year video that their daughter's Chinese name is Chen Mingyu (Chinese: \\\\u9648\\\\u660e\\\\u5b87).[183] Their second daughter was born in August 2017.[184] Zuckerberg and his wife welcomed their third daughter in March 2023 and announced the news across his social media pages.[185] The couple also have a Puli dog named Beast,[186] who has over two million followers on Facebook.[187] Zuckerberg commissioned the visual artist Daniel Arsham to build a 7-foot-tall sculpture of his wife, which was unveiled in 2024.[188]\\\", \\\"score\\\": 0.05564338, \\\"raw_content\\\": null}]}\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -52005,27 +48101,7 @@ "data": { "event": { "delta": { - "text": "[k", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "nowledge_search(query=\"Llama3-8B attention", + "text": "The", "type": "text" }, "event_type": { @@ -52045,7 +48121,7 @@ "data": { "event": { "delta": { - "text": " type\")]", + "text": " current CEO of Meta is Mark Zuckerberg.", "type": "text" }, "event_type": { @@ -52059,41 +48135,6 @@ "metrics": null } }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "Llama3-8B attention type" - }, - "call_id": "0d27e59e-72dd-4976-8049-85a3e533e350", - "tool_name": "knowledge_search" - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": null - } - }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", @@ -52117,19 +48158,55 @@ }, "metrics": [ { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "prompt_tokens", - "unit": null, - "value": 40 + "span_id": "oB7hDf6E", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:07.084924+00:00", + "__module__": "datetime" + }, + "trace_id": "hwA8OLUhQ1qa3ecF", + "type": "metric", + "unit": "tokens", + "value": 1145 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "completion_tokens", - "unit": null, - "value": 24 + "span_id": "oB7hDf6E", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:07.084934+00:00", + "__module__": "datetime" + }, + "trace_id": "hwA8OLUhQ1qa3ecF", + "type": "metric", + "unit": "tokens", + "value": 19 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "total_tokens", - "unit": null, - "value": 64 + "span_id": "oB7hDf6E", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:07.084936+00:00", + "__module__": "datetime" + }, + "trace_id": "hwA8OLUhQ1qa3ecF", + "type": "metric", + "unit": "tokens", + "value": 1164 } ] } @@ -52137,7 +48214,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the current CEO of Meta is.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"current CEO of Meta\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"{\\\"query\\\": \\\"current CEO of Meta\\\", \\\"top_k\\\": [{\\\"title\\\": \\\"Executives - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer Joel Kaplan, Chief Global Affairs Officer Susan Li, Chief Financial Officer Javier Olivan, Chief Operating Officer Chris Cox, Chief Product Officer Andrew \\\\u2018Boz\\\\u2019 Bosworth, Chief Technology Officer Jennifer Newstead, Chief Legal Officer Dave Wehner, Chief Strategy Officer Will Cathcart, Head of WhatsApp Naomi Gleit, Head of Product John Hegeman, Chief Revenue Officer Adam Mosseri, Head of Instagram Erin Egan, Chief Privacy Officer, Policy Michel Protti, Chief Privacy Officer, Product Alex Schultz, Chief Marketing Officer and VP of Analytics Tom Alison, Head of Facebook Nicola Mendelsohn, Head of Global Business Group Ahmad Al-Dahle, VP and Head of GenAI at Meta Joelle Pineau, Vice President of AI Research and Head of FAIR at Meta\\\", \\\"score\\\": 0.8190992, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\\\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\\\u00a9 2025 Meta\\\", \\\"score\\\": 0.79080546, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Meet the Executive CSuite Team of Meta (Facebook) [2025]\\\", \\\"url\\\": \\\"https://digitaldefynd.com/IQ/meet-the-executive-csuite-team-of-meta-facebook/\\\", \\\"content\\\": \\\"Harvard University Executive Programs Free Harvard University Courses As a chief financial officer of Meta, Susan Li oversees the firm\\\\u2019s finance and facilities team to keep track of the company\\\\u2019s overall financial health. The chief operating officer of Meta, Javier Olivan, oversees the firm\\\\u2019s business team, infrastructure, and other products. Andrew Bosworth, called Boz, serves as chief technology officer at Meta and is responsible for leading the firm\\\\u2019s AR/VR organization, Reality Labs. Andrew has also served as engineering director to oversee events, mobile monetization, and feed ads and as VP of ads and business platforms to lead engineering, design, analytics, and product teams. Meta\\\\u2019s c-suite team comprises experienced and diverse executives, having extensive experience in technology, finance, legal, and all major industries.\\\", \\\"score\\\": 0.7602419, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Meta - Leadership & Governance\\\", \\\"url\\\": \\\"https://investor.atmeta.com/leadership-and-governance/\\\", \\\"content\\\": \\\"Mr. Andreessen was a co-founder of Netscape Communications Corporation, a software company, serving in various positions, including Chief Technology Officer and Executive Vice President of Products. Ms. Killefer also served as Assistant Secretary for Management, Chief Financial Officer, and Chief Operating Officer of the U.S. Department of the Treasury from 1997 to 2000 and as a member of the IRS Oversight Board from 2000 to 2005, including as Chair of the IRS Oversight Board from 2002 to 2004. Ms. Travis has served as Executive Vice President and Chief Financial Officer of The Estee Lauder Companies Inc., a global manufacturer and marketer of skin care, makeup, fragrance and hair care products, since August 2012.\\\", \\\"score\\\": 0.6175132, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg - Wikipedia\\\", \\\"url\\\": \\\"https://en.wikipedia.org/wiki/Mark_Zuckerberg\\\", \\\"content\\\": \\\"They began dating in 2003.[175] In September 2010, Chan, who was a medical student at the University of California, San Francisco at the time,[176] moved into his rented house in Palo Alto, California.[177][178] They married on May 19, 2012, in the grounds of his mansion in an event that also celebrated her graduation from medical school.[179][180] Zuckerberg revealed in July 2015 that they were expecting a baby girl and that Chan had previously experienced three miscarriages.[181] Their first daughter was born in December 2015.[182] They announced in a Chinese New Year video that their daughter's Chinese name is Chen Mingyu (Chinese: \\\\u9648\\\\u660e\\\\u5b87).[183] Their second daughter was born in August 2017.[184] Zuckerberg and his wife welcomed their third daughter in March 2023 and announced the news across his social media pages.[185] The couple also have a Puli dog named Beast,[186] who has over two million followers on Facebook.[187] Zuckerberg commissioned the visual artist Daniel Arsham to build a 7-foot-tall sculpture of his wife, which was unveiled in 2024.[188]\\\", \\\"score\\\": 0.05570498, \\\"raw_content\\\": null}]}\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the current CEO of Meta is.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -52165,8 +48242,13 @@ "data": { "event": { "delta": { - "text": "The", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -52185,8 +48267,48 @@ "data": { "event": { "delta": { - "text": " current CEO of Meta is Mark Zuckerberg.", - "type": "text" + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "brave_search.call(query=\"current CEO of Meta\")", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "current CEO of Meta" + }, + "call_id": "535c272b-768b-44fe-b303-2eae022f67f5", + "tool_name": { + "__enum__": "BuiltinTool", + "__module__": "llama_stack.models.llama.datatypes", + "value": "brave_search" + } + }, + "type": "tool_call" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -52194,7 +48316,11 @@ "value": "progress" }, "logprobs": null, - "stop_reason": null + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } }, "metrics": null } @@ -52222,19 +48348,55 @@ }, "metrics": [ { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "prompt_tokens", - "unit": null, - "value": 1235 + "span_id": "AZ60Ocso", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:03.907918+00:00", + "__module__": "datetime" + }, + "trace_id": "hwA8OLUhQ1qa3ecF", + "type": "metric", + "unit": "tokens", + "value": 34 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "completion_tokens", - "unit": null, - "value": 19 + "span_id": "AZ60Ocso", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:03.907933+00:00", + "__module__": "datetime" + }, + "trace_id": "hwA8OLUhQ1qa3ecF", + "type": "metric", + "unit": "tokens", + "value": 10 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "total_tokens", - "unit": null, - "value": 1254 + "span_id": "AZ60Ocso", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:03.907936+00:00", + "__module__": "datetime" + }, + "trace_id": "hwA8OLUhQ1qa3ecF", + "type": "metric", + "unit": "tokens", + "value": 44 } ] } @@ -52242,7 +48404,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the current CEO of Meta is.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"current CEO of Meta\"}, \"call_id\": \"\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"{\\\"query\\\": \\\"current CEO of Meta\\\", \\\"top_k\\\": [{\\\"title\\\": \\\"Meta - Leadership & Governance\\\", \\\"url\\\": \\\"https://investor.atmeta.com/leadership-and-governance/\\\", \\\"content\\\": \\\"Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. Mark is responsible for setting the overall direction and product strategy for the company. He leads the design of Meta's services and development of its core technology and infrastructure. Mark studied computer science at Harvard\\\", \\\"score\\\": 0.8342047, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Executives - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer Joel Kaplan, Chief Global Affairs Officer Susan Li, Chief Financial Officer Javier Olivan, Chief Operating Officer Chris Cox, Chief Product Officer Andrew \\\\u2018Boz\\\\u2019 Bosworth, Chief Technology Officer Jennifer Newstead, Chief Legal Officer Dave Wehner, Chief Strategy Officer Will Cathcart, Head of WhatsApp Naomi Gleit, Head of Product John Hegeman, Chief Revenue Officer Adam Mosseri, Head of Instagram Erin Egan, Chief Privacy Officer, Policy Michel Protti, Chief Privacy Officer, Product Alex Schultz, Chief Marketing Officer and VP of Analytics Tom Alison, Head of Facebook Nicola Mendelsohn, Head of Global Business Group Ahmad Al-Dahle, VP and Head of GenAI at Meta Joelle Pineau, Vice President of AI Research and Head of FAIR at Meta\\\", \\\"score\\\": 0.8190992, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\\\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\\\u00a9 2025 Meta\\\", \\\"score\\\": 0.79099923, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Meet the Executive CSuite Team of Meta (Facebook) [2025]\\\", \\\"url\\\": \\\"https://digitaldefynd.com/IQ/meet-the-executive-csuite-team-of-meta-facebook/\\\", \\\"content\\\": \\\"Harvard University Executive Programs Free Harvard University Courses As a chief financial officer of Meta, Susan Li oversees the firm\\\\u2019s finance and facilities team to keep track of the company\\\\u2019s overall financial health. The chief operating officer of Meta, Javier Olivan, oversees the firm\\\\u2019s business team, infrastructure, and other products. Andrew Bosworth, called Boz, serves as chief technology officer at Meta and is responsible for leading the firm\\\\u2019s AR/VR organization, Reality Labs. Andrew has also served as engineering director to oversee events, mobile monetization, and feed ads and as VP of ads and business platforms to lead engineering, design, analytics, and product teams. Meta\\\\u2019s c-suite team comprises experienced and diverse executives, having extensive experience in technology, finance, legal, and all major industries.\\\", \\\"score\\\": 0.7602419, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg - Wikipedia\\\", \\\"url\\\": \\\"https://en.wikipedia.org/wiki/Mark_Zuckerberg\\\", \\\"content\\\": \\\"They began dating in 2003.[175] In September 2010, Chan, who was a medical student at the University of California, San Francisco at the time,[176] moved into his rented house in Palo Alto, California.[177][178] They married on May 19, 2012, in the grounds of his mansion in an event that also celebrated her graduation from medical school.[179][180] Zuckerberg revealed in July 2015 that they were expecting a baby girl and that Chan had previously experienced three miscarriages.[181] Their first daughter was born in December 2015.[182] They announced in a Chinese New Year video that their daughter's Chinese name is Chen Mingyu (Chinese: \\\\u9648\\\\u660e\\\\u5b87).[183] Their second daughter was born in August 2017.[184] Zuckerberg and his wife welcomed their third daughter in March 2023 and announced the news across his social media pages.[185] The couple also have a Puli dog named Beast,[186] who has over two million followers on Facebook.[187] Zuckerberg commissioned the visual artist Daniel Arsham to build a 7-foot-tall sculpture of his wife, which was unveiled in 2024.[188]\\\", \\\"score\\\": 0.05564338, \\\"raw_content\\\": null}]}\", \"role\": \"tool\", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": \"get_boiling_point\", \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -52290,7 +48452,27 @@ "data": { "event": { "delta": { - "text": " current CEO of Meta is Mark Zuckerberg.", + "text": " boiling point of polyjuice is -100 degrees Celsius", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ".", "type": "text" }, "event_type": { @@ -52332,16 +48514,16 @@ "provider_id": "fireworks" }, "metric": "prompt_tokens", - "span_id": "oB7hDf6E", + "span_id": "drZjZkfj", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:07.084924+00:00", + "__datetime__": "2025-03-07T02:04:33.852666+00:00", "__module__": "datetime" }, - "trace_id": "hwA8OLUhQ1qa3ecF", + "trace_id": "Sn0I7GFHTxKxewK2", "type": "metric", "unit": "tokens", - "value": 1145 + "value": 77 }, { "attributes": { @@ -52349,16 +48531,16 @@ "provider_id": "fireworks" }, "metric": "completion_tokens", - "span_id": "oB7hDf6E", + "span_id": "drZjZkfj", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:07.084934+00:00", + "__datetime__": "2025-03-07T02:04:33.852692+00:00", "__module__": "datetime" }, - "trace_id": "hwA8OLUhQ1qa3ecF", + "trace_id": "Sn0I7GFHTxKxewK2", "type": "metric", "unit": "tokens", - "value": 19 + "value": 23 }, { "attributes": { @@ -52366,16 +48548,16 @@ "provider_id": "fireworks" }, "metric": "total_tokens", - "span_id": "oB7hDf6E", + "span_id": "drZjZkfj", "timestamp": { "__class__": "datetime", - "__datetime__": "2025-03-07T01:44:07.084936+00:00", + "__datetime__": "2025-03-07T02:04:33.852699+00:00", "__module__": "datetime" }, - "trace_id": "hwA8OLUhQ1qa3ecF", + "trace_id": "Sn0I7GFHTxKxewK2", "type": "metric", "unit": "tokens", - "value": 1164 + "value": 100 } ] } @@ -52383,7 +48565,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the current CEO of Meta is.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -52411,13 +48593,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "started" - }, - "tool_call": "", - "type": "tool_call" + "text": "The", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -52436,13 +48613,8 @@ "data": { "event": { "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "brave_search.call(query=\"current CEO of Meta\")", - "type": "tool_call" + "text": " boiling point of polyjuice is -100 degrees Celsius.", + "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", @@ -52455,45 +48627,6 @@ "metrics": null } }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "succeeded" - }, - "tool_call": { - "arguments": { - "query": "current CEO of Meta" - }, - "call_id": "94b2529c-a7f4-43ca-ab5c-92485d5cc6f3", - "tool_name": { - "__enum__": "BuiltinTool", - "__module__": "llama_stack.models.llama.datatypes", - "value": "brave_search" - } - }, - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": null - } - }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", @@ -52517,19 +48650,55 @@ }, "metrics": [ { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "prompt_tokens", - "unit": null, - "value": 34 + "span_id": "WMEZtUXH", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:00:32.617998+00:00", + "__module__": "datetime" + }, + "trace_id": "f9RM1qaUTk2LvaVo", + "type": "metric", + "unit": "tokens", + "value": 77 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "completion_tokens", - "unit": null, - "value": 10 + "span_id": "WMEZtUXH", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:00:32.618030+00:00", + "__module__": "datetime" + }, + "trace_id": "f9RM1qaUTk2LvaVo", + "type": "metric", + "unit": "tokens", + "value": 23 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "total_tokens", - "unit": null, - "value": 44 + "span_id": "WMEZtUXH", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:00:32.618036+00:00", + "__module__": "datetime" + }, + "trace_id": "f9RM1qaUTk2LvaVo", + "type": "metric", + "unit": "tokens", + "value": 100 } ] } @@ -52537,7 +48706,7 @@ ], "type": "generator" }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": \"get_boiling_point\", \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"required\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { "__module__": "llama_stack.apis.inference.inference", @@ -52585,7 +48754,7 @@ "data": { "event": { "delta": { - "text": " boiling point of polyjuice is -100 degrees Celsius.", + "text": " function get_boiling_point is not", "type": "text" }, "event_type": { @@ -52605,58 +48774,33 @@ "data": { "event": { "delta": { - "text": "", + "text": " able", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "complete" + "value": "progress" }, "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } + "stop_reason": null }, - "metrics": [ - { - "metric": "prompt_tokens", - "unit": null, - "value": 77 - }, - { - "metric": "completion_tokens", - "unit": null, - "value": 23 - }, - { - "metric": "total_tokens", - "unit": null, - "value": 100 - } - ] + "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", + "text": " to find the", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -52670,7 +48814,7 @@ "data": { "event": { "delta": { - "text": "The", + "text": " boiling point of \"polyjuice\" as", "type": "text" }, "event_type": { @@ -52690,7 +48834,7 @@ "data": { "event": { "delta": { - "text": " boiling point of polyjuice is -100 degrees Celsius.", + "text": " it", "type": "text" }, "event_type": { @@ -52710,58 +48854,33 @@ "data": { "event": { "delta": { - "text": "", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "complete" - }, - "logprobs": null, - "stop_reason": { - "__enum__": "StopReason", - "__module__": "llama_stack.models.llama.datatypes", - "value": "end_of_turn" - } - }, - "metrics": [ - { - "metric": "prompt_tokens", - "unit": null, - "value": 77 + "text": " is not a real liquid", + "type": "text" }, - { - "metric": "completion_tokens", - "unit": null, - "value": 23 + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" }, - { - "metric": "total_tokens", - "unit": null, - "value": 100 - } - ] + "logprobs": null, + "stop_reason": null + }, + "metrics": null } - } - ], - "type": "generator" - }, - "[[\"meta-llama/Llama-3.3-70B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": true, \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\", \"tool_name\": \"get_boiling_point\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"required\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { - "chunks": [ + }, { "__module__": "llama_stack.apis.inference.inference", "__pydantic__": "ChatCompletionResponseStreamChunk", "data": { "event": { "delta": { - "text": "", + "text": ". Polyju", "type": "text" }, "event_type": { "__enum__": "ChatCompletionResponseEventType", "__module__": "llama_stack.apis.inference.inference", - "value": "start" + "value": "progress" }, "logprobs": null, "stop_reason": null @@ -52775,7 +48894,7 @@ "data": { "event": { "delta": { - "text": "The", + "text": "ice is a fictional substance from the", "type": "text" }, "event_type": { @@ -52795,7 +48914,7 @@ "data": { "event": { "delta": { - "text": " boiling point of polyjuice is -100 degrees Celsius.", + "text": " Harry Potter series.", "type": "text" }, "event_type": { @@ -52832,19 +48951,55 @@ }, "metrics": [ { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "prompt_tokens", - "unit": null, + "span_id": "p7Vx9VAq", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:04:28.232189+00:00", + "__module__": "datetime" + }, + "trace_id": "WKEqFugATCeCl8mc", + "type": "metric", + "unit": "tokens", "value": 77 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "completion_tokens", - "unit": null, - "value": 23 + "span_id": "p7Vx9VAq", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:04:28.232325+00:00", + "__module__": "datetime" + }, + "trace_id": "WKEqFugATCeCl8mc", + "type": "metric", + "unit": "tokens", + "value": 51 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "total_tokens", - "unit": null, - "value": 100 + "span_id": "p7Vx9VAq", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:04:28.232334+00:00", + "__module__": "datetime" + }, + "trace_id": "WKEqFugATCeCl8mc", + "type": "metric", + "unit": "tokens", + "value": 128 } ] } @@ -53963,7 +50118,7 @@ "data": { "event": { "delta": { - "text": "get_boiling_point(liquid_name='polyjuice', cel", + "text": "get_boiling_point(liquid_name='polyjuice", "type": "text" }, "event_type": { @@ -53983,7 +50138,7 @@ "data": { "event": { "delta": { - "text": "cius=True)]", + "text": "', celcius=True)]", "type": "text" }, "event_type": { @@ -54013,7 +50168,7 @@ "celcius": true, "liquid_name": "polyjuice" }, - "call_id": "20aad753-f01f-42cc-bb68-aac292a707a1", + "call_id": "d43b2636-903d-430d-8389-91eefe5a1d75", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -54056,18 +50211,54 @@ }, "metrics": [ { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "prompt_tokens", - "unit": null, + "span_id": "9EBiVeAT", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:04:32.221646+00:00", + "__module__": "datetime" + }, + "trace_id": "7kB12OwpSUOcwmJV", + "type": "metric", + "unit": "tokens", "value": 30 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "completion_tokens", - "unit": null, + "span_id": "9EBiVeAT", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:04:32.221673+00:00", + "__module__": "datetime" + }, + "trace_id": "7kB12OwpSUOcwmJV", + "type": "metric", + "unit": "tokens", "value": 28 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "total_tokens", - "unit": null, + "span_id": "9EBiVeAT", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:04:32.221680+00:00", + "__module__": "datetime" + }, + "trace_id": "7kB12OwpSUOcwmJV", + "type": "metric", + "unit": "tokens", "value": 58 } ] @@ -54174,7 +50365,7 @@ "celcius": true, "liquid_name": "polyjuice" }, - "call_id": "d690dafc-3fe8-4356-ba40-5682803c9fbf", + "call_id": "0548b2ef-daa4-4099-bb2c-b34f00752339", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -54217,18 +50408,54 @@ }, "metrics": [ { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "prompt_tokens", - "unit": null, + "span_id": "lc3YWIQH", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:00:31.366139+00:00", + "__module__": "datetime" + }, + "trace_id": "zDQV0rn3TNKfByA0", + "type": "metric", + "unit": "tokens", "value": 30 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "completion_tokens", - "unit": null, + "span_id": "lc3YWIQH", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:00:31.366166+00:00", + "__module__": "datetime" + }, + "trace_id": "zDQV0rn3TNKfByA0", + "type": "metric", + "unit": "tokens", "value": 28 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "total_tokens", - "unit": null, + "span_id": "lc3YWIQH", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:00:31.366172+00:00", + "__module__": "datetime" + }, + "trace_id": "zDQV0rn3TNKfByA0", + "type": "metric", + "unit": "tokens", "value": 58 } ] @@ -54285,7 +50512,47 @@ "data": { "event": { "delta": { - "text": "juice is a fictional potion from the Harry Potter series by", + "text": "juice is a fictional potion from", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the Harry Potter series by J.K. Rowling. As it", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "'s not a real substance, it doesn't have a boiling point", "type": "text" }, "event_type": { @@ -54305,7 +50572,7 @@ "data": { "event": { "delta": { - "text": " J.K. Rowling. As it's not a", + "text": ". Polyjuice Potion is a magical concoction", "type": "text" }, "event_type": { @@ -54325,7 +50592,7 @@ "data": { "event": { "delta": { - "text": " real substance, it doesn't have a boiling point. Polyjuice Potion is", + "text": " that allows the drinker to assume the form and", "type": "text" }, "event_type": { @@ -54345,7 +50612,7 @@ "data": { "event": { "delta": { - "text": " a magical concoction that allows the drinker to assume the form and", + "text": " appearance", "type": "text" }, "event_type": { @@ -54365,7 +50632,7 @@ "data": { "event": { "delta": { - "text": " appearance of another person, but it's not a physical substance that can be measured or analyzed in", + "text": " of another person, but it's not a physical substance that can", "type": "text" }, "event_type": { @@ -54385,7 +50652,7 @@ "data": { "event": { "delta": { - "text": " the same way as real-world", + "text": " be measured or analyzed in the same way as real-world", "type": "text" }, "event_type": { @@ -54425,7 +50692,7 @@ "data": { "event": { "delta": { - "text": " have any other questions or if there's anything else I can", + "text": " have any other questions or", "type": "text" }, "event_type": { @@ -54445,7 +50712,7 @@ "data": { "event": { "delta": { - "text": " help you with", + "text": " if there's anything else I can help you with, feel free to ask", "type": "text" }, "event_type": { @@ -54465,7 +50732,7 @@ "data": { "event": { "delta": { - "text": ", feel free to ask!", + "text": "!", "type": "text" }, "event_type": { @@ -54502,18 +50769,54 @@ }, "metrics": [ { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "prompt_tokens", - "unit": null, + "span_id": "M0oC9v8Y", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:04:30.531648+00:00", + "__module__": "datetime" + }, + "trace_id": "0CMlh2kQShSVm3zE", + "type": "metric", + "unit": "tokens", "value": 30 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "completion_tokens", - "unit": null, + "span_id": "M0oC9v8Y", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:04:30.531666+00:00", + "__module__": "datetime" + }, + "trace_id": "0CMlh2kQShSVm3zE", + "type": "metric", + "unit": "tokens", "value": 113 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "total_tokens", - "unit": null, + "span_id": "M0oC9v8Y", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:04:30.531671+00:00", + "__module__": "datetime" + }, + "trace_id": "0CMlh2kQShSVm3zE", + "type": "metric", + "unit": "tokens", "value": 143 } ] @@ -54570,7 +50873,7 @@ "data": { "event": { "delta": { - "text": "get_boiling_point(liquid_name='polyjuice", + "text": "get_boiling_point(liquid_name='polyjuice', cel", "type": "text" }, "event_type": { @@ -54590,7 +50893,7 @@ "data": { "event": { "delta": { - "text": "', celcius=True)]", + "text": "cius=True)]", "type": "text" }, "event_type": { @@ -54620,7 +50923,7 @@ "celcius": true, "liquid_name": "polyjuice" }, - "call_id": "8671dbc5-2f70-48a3-a844-94ad7652a468", + "call_id": "acbb04a1-08f4-4277-9b66-aadda2fa2be7", "tool_name": "get_boiling_point" }, "type": "tool_call" @@ -54663,18 +50966,54 @@ }, "metrics": [ { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "prompt_tokens", - "unit": null, + "span_id": "jMXDDKvp", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:04:26.175063+00:00", + "__module__": "datetime" + }, + "trace_id": "44TwzIrGS2aqfbVn", + "type": "metric", + "unit": "tokens", "value": 30 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "completion_tokens", - "unit": null, + "span_id": "jMXDDKvp", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:04:26.175128+00:00", + "__module__": "datetime" + }, + "trace_id": "44TwzIrGS2aqfbVn", + "type": "metric", + "unit": "tokens", "value": 28 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "total_tokens", - "unit": null, + "span_id": "jMXDDKvp", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T02:04:26.175137+00:00", + "__module__": "datetime" + }, + "trace_id": "44TwzIrGS2aqfbVn", + "type": "metric", + "unit": "tokens", "value": 58 } ] @@ -54788,18 +51127,54 @@ }, "metrics": [ { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "prompt_tokens", - "unit": null, + "span_id": "bxIams_G", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:13.404182+00:00", + "__module__": "datetime" + }, + "trace_id": "snO106yxStaL10ow", + "type": "metric", + "unit": "tokens", "value": 252 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "completion_tokens", - "unit": null, + "span_id": "bxIams_G", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:13.404224+00:00", + "__module__": "datetime" + }, + "trace_id": "snO106yxStaL10ow", + "type": "metric", + "unit": "tokens", "value": 20 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "total_tokens", - "unit": null, + "span_id": "bxIams_G", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:13.404230+00:00", + "__module__": "datetime" + }, + "trace_id": "snO106yxStaL10ow", + "type": "metric", + "unit": "tokens", "value": 272 } ] @@ -54891,57 +51266,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "\n if n <=", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": " ", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "3:\n return True\n if n % 2 == 0", + "tool_call": "\n if n <= 3:\n return True", "type": "tool_call" }, "event_type": { @@ -54966,7 +51291,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " or n % 3 == 0:\n return False\n i", + "tool_call": "\n if n % 2 == 0 or n % 3", "type": "tool_call" }, "event_type": { @@ -54991,7 +51316,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " = 5\n while i * i <= n:\n if n", + "tool_call": " == 0:\n return False\n i = 5\n ", "type": "tool_call" }, "event_type": { @@ -55016,7 +51341,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " % i == 0 or n % (i + 2) ==", + "tool_call": " while i * i <= n:\n if n", "type": "tool_call" }, "event_type": { @@ -55041,7 +51366,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " 0:\n return False\n i += 6\n return", + "tool_call": " % i == 0 or n % (i", "type": "tool_call" }, "event_type": { @@ -55066,7 +51391,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " True\n\ndef nth_prime", + "tool_call": " + 2) == 0:\n return False\n i +=", "type": "tool_call" }, "event_type": { @@ -55091,7 +51416,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "(n):\n count = 0\n num", + "tool_call": " 6\n return True\n\ndef nth_prime(n):\n count =", "type": "tool_call" }, "event_type": { @@ -55116,7 +51441,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " = 2\n while True:\n if is_prime(num):\n ", + "tool_call": " 0\n num = 2\n while True:\n if", "type": "tool_call" }, "event_type": { @@ -55141,7 +51466,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " count += 1\n", + "tool_call": " is_prime(num):\n count += 1\n if count == n", "type": "tool_call" }, "event_type": { @@ -55166,7 +51491,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " if count == n:\n return num\n", + "tool_call": ":\n return num\n num += 1\n\nprint(nth_prime", "type": "tool_call" }, "event_type": { @@ -55191,7 +51516,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " num += 1\n\nprint(nth_prime(100))", + "tool_call": "(100))", "type": "tool_call" }, "event_type": { @@ -55220,7 +51545,7 @@ "arguments": { "code": "def is_prime(n):\n if n <= 1:\n return False\n if n <= 3:\n return True\n if n % 2 == 0 or n % 3 == 0:\n return False\n i = 5\n while i * i <= n:\n if n % i == 0 or n % (i + 2) == 0:\n return False\n i += 6\n return True\n\ndef nth_prime(n):\n count = 0\n num = 2\n while True:\n if is_prime(num):\n count += 1\n if count == n:\n return num\n num += 1\n\nprint(nth_prime(100))" }, - "call_id": "836806e9-2184-4dac-9769-d94a496f9f95", + "call_id": "e1110bc1-dc83-480d-ad33-09d49f5ccc8d", "tool_name": { "__enum__": "BuiltinTool", "__module__": "llama_stack.models.llama.datatypes", @@ -55267,18 +51592,54 @@ }, "metrics": [ { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "prompt_tokens", - "unit": null, + "span_id": "5J3hM-La", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:09.121100+00:00", + "__module__": "datetime" + }, + "trace_id": "snO106yxStaL10ow", + "type": "metric", + "unit": "tokens", "value": 40 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "completion_tokens", - "unit": null, + "span_id": "5J3hM-La", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:09.121127+00:00", + "__module__": "datetime" + }, + "trace_id": "snO106yxStaL10ow", + "type": "metric", + "unit": "tokens", "value": 10 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "total_tokens", - "unit": null, + "span_id": "5J3hM-La", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:44:09.121132+00:00", + "__module__": "datetime" + }, + "trace_id": "snO106yxStaL10ow", + "type": "metric", + "unit": "tokens", "value": 50 } ] @@ -55335,27 +51696,7 @@ "data": { "event": { "delta": { - "text": "plexity the company was founded in 202", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": "2.", + "text": "plexity the company was founded in 2022.", "type": "text" }, "event_type": { @@ -55392,18 +51733,54 @@ }, "metrics": [ { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "prompt_tokens", - "unit": null, + "span_id": "6jxCq3gU", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:50.430436+00:00", + "__module__": "datetime" + }, + "trace_id": "XhZWljYTTDCYF7vI", + "type": "metric", + "unit": "tokens", "value": 68 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "completion_tokens", - "unit": null, + "span_id": "6jxCq3gU", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:50.430477+00:00", + "__module__": "datetime" + }, + "trace_id": "XhZWljYTTDCYF7vI", + "type": "metric", + "unit": "tokens", "value": 22 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "total_tokens", - "unit": null, + "span_id": "6jxCq3gU", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:50.430489+00:00", + "__module__": "datetime" + }, + "trace_id": "XhZWljYTTDCYF7vI", + "type": "metric", + "unit": "tokens", "value": 90 } ] @@ -55509,7 +51886,7 @@ "arguments": { "query": "Perplexity the company founding date" }, - "call_id": "3c2feef6-e21b-4715-83a5-61d24d09d7b5", + "call_id": "199ef050-bc11-4e4b-935d-f5241c3f40ef", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -55552,18 +51929,54 @@ }, "metrics": [ { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "prompt_tokens", - "unit": null, + "span_id": "m4wMGuSN", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:49.880525+00:00", + "__module__": "datetime" + }, + "trace_id": "XhZWljYTTDCYF7vI", + "type": "metric", + "unit": "tokens", "value": 29 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "completion_tokens", - "unit": null, + "span_id": "m4wMGuSN", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:49.880576+00:00", + "__module__": "datetime" + }, + "trace_id": "XhZWljYTTDCYF7vI", + "type": "metric", + "unit": "tokens", "value": 23 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "total_tokens", - "unit": null, + "span_id": "m4wMGuSN", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:49.880585+00:00", + "__module__": "datetime" + }, + "trace_id": "XhZWljYTTDCYF7vI", + "type": "metric", + "unit": "tokens", "value": 52 } ] @@ -55697,18 +52110,54 @@ }, "metrics": [ { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "prompt_tokens", - "unit": null, + "span_id": "OyfVMRgR", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:53.322420+00:00", + "__module__": "datetime" + }, + "trace_id": "TMrhR55CR-KrmGp0", + "type": "metric", + "unit": "tokens", "value": 63 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "completion_tokens", - "unit": null, + "span_id": "OyfVMRgR", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:53.322482+00:00", + "__module__": "datetime" + }, + "trace_id": "TMrhR55CR-KrmGp0", + "type": "metric", + "unit": "tokens", "value": 45 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "total_tokens", - "unit": null, + "span_id": "OyfVMRgR", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:53.322490+00:00", + "__module__": "datetime" + }, + "trace_id": "TMrhR55CR-KrmGp0", + "type": "metric", + "unit": "tokens", "value": 108 } ] @@ -55794,7 +52243,7 @@ "arguments": { "query": "NBA creation date" }, - "call_id": "2a770dea-edee-4890-b0e3-930a4cb167e3", + "call_id": "388e55ab-448a-4a98-905b-196c051bdeea", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -55837,18 +52286,54 @@ }, "metrics": [ { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "prompt_tokens", - "unit": null, + "span_id": "QpFMmy3B", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:52.235138+00:00", + "__module__": "datetime" + }, + "trace_id": "TMrhR55CR-KrmGp0", + "type": "metric", + "unit": "tokens", "value": 27 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "completion_tokens", - "unit": null, + "span_id": "QpFMmy3B", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:52.235160+00:00", + "__module__": "datetime" + }, + "trace_id": "TMrhR55CR-KrmGp0", + "type": "metric", + "unit": "tokens", "value": 20 }, { + "attributes": { + "model_id": "meta-llama/Llama-3.3-70B-Instruct", + "provider_id": "fireworks" + }, "metric": "total_tokens", - "unit": null, + "span_id": "QpFMmy3B", + "timestamp": { + "__class__": "datetime", + "__datetime__": "2025-03-07T01:45:52.235165+00:00", + "__module__": "datetime" + }, + "trace_id": "TMrhR55CR-KrmGp0", + "type": "metric", + "unit": "tokens", "value": 47 } ] diff --git a/tests/integration/fixtures/recorded_responses/invoke_tool.json b/tests/integration/fixtures/recorded_responses/invoke_tool.json index 0b9fbd6bb0..8db8ad9661 100644 --- a/tests/integration/fixtures/recorded_responses/invoke_tool.json +++ b/tests/integration/fixtures/recorded_responses/invoke_tool.json @@ -64,19 +64,6 @@ } } }, - "[[], {\"kwargs\": {\"code\": \"import pandas as pd\\n\\n# Load the CSV file\\ndf = pd.read_csv(\\\"\")\\n\\n# Print the first few rows of the dataframe\\nprint(df.head())\\n\\n# Print information about the dataframe\\nprint(df.info())\\n\\n# Print summary statistics of the dataframe\\nprint(df.describe())\", \"session_id\": \"\"}, \"tool_name\": \"code_interpreter\"}]": { - "type": "value", - "value": { - "__module__": "llama_stack.apis.tools.tools", - "__pydantic__": "ToolInvocationResult", - "data": { - "content": "error\n[stdout]\n[Errno 2] No such file or directory: 'bwrap'\n[/stdout]\n[stderr]\n[Errno 2] No such file or directory: 'bwrap'\n[/stderr]", - "error_code": null, - "error_message": null, - "metadata": null - } - } - }, "[[], {\"kwargs\": {\"code\": \"import pandas as pd\\ndf = pd.read_csv(\\\"\")\\nprint(df.head())\", \"session_id\": \"\"}, \"tool_name\": \"code_interpreter\"}]": { "type": "value", "value": { @@ -457,7 +444,7 @@ "__module__": "llama_stack.apis.tools.tools", "__pydantic__": "ToolInvocationResult", "data": { - "content": "{\"query\": \"current CEO of Meta\", \"top_k\": [{\"title\": \"Executives - Meta\", \"url\": \"https://about.meta.com/media-gallery/executives/\", \"content\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer Joel Kaplan, Chief Global Affairs Officer Susan Li, Chief Financial Officer Javier Olivan, Chief Operating Officer Chris Cox, Chief Product Officer Andrew \\u2018Boz\\u2019 Bosworth, Chief Technology Officer Jennifer Newstead, Chief Legal Officer Dave Wehner, Chief Strategy Officer Will Cathcart, Head of WhatsApp Naomi Gleit, Head of Product John Hegeman, Chief Revenue Officer Adam Mosseri, Head of Instagram Erin Egan, Chief Privacy Officer, Policy Michel Protti, Chief Privacy Officer, Product Alex Schultz, Chief Marketing Officer and VP of Analytics Tom Alison, Head of Facebook Nicola Mendelsohn, Head of Global Business Group Ahmad Al-Dahle, VP and Head of GenAI at Meta Joelle Pineau, Vice President of AI Research and Head of FAIR at Meta\", \"score\": 0.8190992, \"raw_content\": null}, {\"title\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\", \"url\": \"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\", \"content\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\u00a9 2025 Meta\", \"score\": 0.79080546, \"raw_content\": null}, {\"title\": \"Meet the Executive CSuite Team of Meta (Facebook) [2025]\", \"url\": \"https://digitaldefynd.com/IQ/meet-the-executive-csuite-team-of-meta-facebook/\", \"content\": \"Harvard University Executive Programs Free Harvard University Courses As a chief financial officer of Meta, Susan Li oversees the firm\\u2019s finance and facilities team to keep track of the company\\u2019s overall financial health. The chief operating officer of Meta, Javier Olivan, oversees the firm\\u2019s business team, infrastructure, and other products. Andrew Bosworth, called Boz, serves as chief technology officer at Meta and is responsible for leading the firm\\u2019s AR/VR organization, Reality Labs. Andrew has also served as engineering director to oversee events, mobile monetization, and feed ads and as VP of ads and business platforms to lead engineering, design, analytics, and product teams. Meta\\u2019s c-suite team comprises experienced and diverse executives, having extensive experience in technology, finance, legal, and all major industries.\", \"score\": 0.7602419, \"raw_content\": null}, {\"title\": \"Meta - Leadership & Governance\", \"url\": \"https://investor.atmeta.com/leadership-and-governance/\", \"content\": \"Mr. Andreessen was a co-founder of Netscape Communications Corporation, a software company, serving in various positions, including Chief Technology Officer and Executive Vice President of Products. Ms. Killefer also served as Assistant Secretary for Management, Chief Financial Officer, and Chief Operating Officer of the U.S. Department of the Treasury from 1997 to 2000 and as a member of the IRS Oversight Board from 2000 to 2005, including as Chair of the IRS Oversight Board from 2002 to 2004. Ms. Travis has served as Executive Vice President and Chief Financial Officer of The Estee Lauder Companies Inc., a global manufacturer and marketer of skin care, makeup, fragrance and hair care products, since August 2012.\", \"score\": 0.6175132, \"raw_content\": null}, {\"title\": \"Mark Zuckerberg - Wikipedia\", \"url\": \"https://en.wikipedia.org/wiki/Mark_Zuckerberg\", \"content\": \"They began dating in 2003.[175] In September 2010, Chan, who was a medical student at the University of California, San Francisco at the time,[176] moved into his rented house in Palo Alto, California.[177][178] They married on May 19, 2012, in the grounds of his mansion in an event that also celebrated her graduation from medical school.[179][180] Zuckerberg revealed in July 2015 that they were expecting a baby girl and that Chan had previously experienced three miscarriages.[181] Their first daughter was born in December 2015.[182] They announced in a Chinese New Year video that their daughter's Chinese name is Chen Mingyu (Chinese: \\u9648\\u660e\\u5b87).[183] Their second daughter was born in August 2017.[184] Zuckerberg and his wife welcomed their third daughter in March 2023 and announced the news across his social media pages.[185] The couple also have a Puli dog named Beast,[186] who has over two million followers on Facebook.[187] Zuckerberg commissioned the visual artist Daniel Arsham to build a 7-foot-tall sculpture of his wife, which was unveiled in 2024.[188]\", \"score\": 0.05570498, \"raw_content\": null}]}", + "content": "{\"query\": \"current CEO of Meta\", \"top_k\": [{\"title\": \"Meta - Leadership & Governance\", \"url\": \"https://investor.atmeta.com/leadership-and-governance/\", \"content\": \"Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. Mark is responsible for setting the overall direction and product strategy for the company. He leads the design of Meta's services and development of its core technology and infrastructure. Mark studied computer science at Harvard\", \"score\": 0.8342047, \"raw_content\": null}, {\"title\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\", \"url\": \"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\", \"content\": \"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\u00a9 2025 Meta\", \"score\": 0.79099923, \"raw_content\": null}, {\"title\": \"The 11 People Running Meta's $1 Trillion Social Media and ... - Observer\", \"url\": \"https://observer.com/2024/01/meta-facebook-top-executives/\", \"content\": \"Meta has one of the most stable leadership team in the tech industry. Almost all of Meta's top executives have been with the company for well over a decade. ... 39, cofounder, chairman and CEO\", \"score\": 0.45536873, \"raw_content\": null}, {\"title\": \"Executives - Meta\", \"url\": \"https://about.meta.com/media-gallery/executives/\", \"content\": \"Meta leadership: images of senior executives for download to use in articles about the company.\", \"score\": 0.21026355, \"raw_content\": null}, {\"title\": \"Mark Zuckerberg - Wikipedia\", \"url\": \"https://en.wikipedia.org/wiki/Mark_Zuckerberg\", \"content\": \"They began dating in 2003.[175] In September 2010, Chan, who was a medical student at the University of California, San Francisco at the time,[176] moved into his rented house in Palo Alto, California.[177][178] They married on May 19, 2012, in the grounds of his mansion in an event that also celebrated her graduation from medical school.[179][180] Zuckerberg revealed in July 2015 that they were expecting a baby girl and that Chan had previously experienced three miscarriages.[181] Their first daughter was born in December 2015.[182] They announced in a Chinese New Year video that their daughter's Chinese name is Chen Mingyu (Chinese: \\u9648\\u660e\\u5b87).[183] Their second daughter was born in August 2017.[184] Zuckerberg and his wife welcomed their third daughter in March 2023 and announced the news across his social media pages.[185] The couple also have a Puli dog named Beast,[186] who has over two million followers on Facebook.[187] Zuckerberg commissioned the visual artist Daniel Arsham to build a 7-foot-tall sculpture of his wife, which was unveiled in 2024.[188]\", \"score\": 0.05564338, \"raw_content\": null}]}", "error_code": null, "error_message": null, "metadata": null @@ -476,23 +463,23 @@ "type": "text" }, { - "text": "Result 1:\nDocument_id:14c27\nContent: .. _lora_finetune_label:\n\n============================\nFine-Tuning Llama2 with LoRA\n============================\n\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\n See :ref:`below` for how to do this.\n\nLet's inspect each of these models a bit more closely.\n\n.. code-block:: bash\n\n # Print the first layer's self-attention in the usual Llama2 model\n >>> print(base_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (pos_embeddings): RotaryPositionalEmbeddings()\n )\n\n # Print the same for Llama2 with LoRA weights\n >>> print(lora_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): LoRALinear(\n (dropout): Dropout(p=0.0, inplace=False)\n \n", + "text": "Result 2:\nDocument_id:20e5d\nContent: LoRA to Llama2 models\n------------------------------\n\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\n\n.. code-block:: python\n\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\n\n # Build Llama2 without any LoRA layers\n base_model = llama2_7b()\n\n # The default settings for lora_llama2_7b will match those for llama2_7b\n # We just need to define which layers we want LoRA applied to.\n # Within each self-attention, we can choose from [\"q_proj\", \"k_proj\", \"v_proj\", and \"output_proj\"].\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\n # layers outside of the self-attention.\n lora_model = lora_llama2_7b(lora_attn_modules=[\"q_proj\", \"v_proj\"])\n\n.. note::\n\n Calling :func:`lora_llama_2_7b ` alone will not handle the definition of which parameters are trainable.\n See :ref:`below` for how to do this.\n\nLet's inspect each of these models a bit more closely.\n\n.. code-block:: bash\n\n # Print the first layer's self-attention in the usual Llama2 model\n >>> print(base_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (pos_embeddings): RotaryPositionalEmbeddings()\n )\n\n # Print the same for Llama2 with LoRA weights\n >>> print(lora_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): LoRALinear(\n (dropout): Dropout(p=0.0, inplace=False)\n \n", "type": "text" }, { - "text": "Result 3:\nDocument_id:14c27\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", + "text": "Result 3:\nDocument_id:20e5d\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", "type": "text" }, { - "text": "Result 4:\nDocument_id:14c27\nContent: from our Llama2\nmodel without any wrappers or custom checkpoint conversion logic.\n\n.. code-block:: python\n\n # Assuming that base_model already has the pretrained Llama2 weights,\n # this will directly load them into your LoRA model without any conversion necessary.\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\n\n.. note::\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\n :func:`validate_missing_and_unexpected_for_lora() `.\n\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\n\n.. _setting_trainable_params:\n\n.. code-block:: python\n\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\n\n # Fetch all params from the model that are associated with LoRA.\n lora_params = get_adapter_params(lora_model)\n\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\n set_trainable_params(lora_model, lora_params)\n\n # Print the total number of parameters\n total_params = sum([p.numel() for p in lora_model.parameters()])\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\n print(\n f\"\"\"\n {total_params} total params,\n {trainable_params}\" trainable params,\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\n \"\"\"\n )\n\n 6742609920 total params,\n 4194304 trainable params,\n 0.06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `.\n\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\n\n.. _setting_trainable_params:\n\n.. code-block:: python\n\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\n\n # Fetch all params from the model that are associated with LoRA.\n lora_params = get_adapter_params(lora_model)\n\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\n set_trainable_params(lora_model, lora_params)\n\n # Print the total number of parameters\n total_params = sum([p.numel() for p in lora_model.parameters()])\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\n print(\n f\"\"\"\n {total_params} total params,\n {trainable_params}\" trainable params,\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\n \"\"\"\n )\n\n 6742609920 total params,\n 4194304 trainable params,\n 0.06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe