Skip to content

Commit 2dfabd6

Browse files
authored
feat(mcp): Add per-tool-call metrics to logs episode response (#24389)
1 parent 6a093ec commit 2dfabd6

2 files changed

Lines changed: 332 additions & 34 deletions

File tree

pkg/cli/logs_episode.go

Lines changed: 86 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -27,45 +27,58 @@ type EpisodeEdge struct {
2727
EpisodeID string `json:"episode_id,omitempty"`
2828
}
2929

30+
// EpisodeToolCall represents a single MCP tool call within an episode.
31+
// It provides per-call observability for token consumption, latency, and error details.
32+
type EpisodeToolCall struct {
33+
Tool string `json:"tool"`
34+
Server string `json:"server"`
35+
Tokens int `json:"tokens"`
36+
DurationMS int64 `json:"duration_ms"`
37+
Status string `json:"status"`
38+
Error string `json:"error,omitempty"`
39+
}
40+
3041
// EpisodeData represents a deterministic episode rollup derived from workflow runs.
3142
type EpisodeData struct {
32-
EpisodeID string `json:"episode_id"`
33-
Kind string `json:"kind"`
34-
Confidence string `json:"confidence"`
35-
Reasons []string `json:"reasons,omitempty"`
36-
RootRunID int64 `json:"root_run_id,omitempty"`
37-
RunIDs []int64 `json:"run_ids"`
38-
WorkflowNames []string `json:"workflow_names"`
39-
PrimaryWorkflow string `json:"primary_workflow,omitempty"`
40-
TotalRuns int `json:"total_runs"`
41-
TotalTokens int `json:"total_tokens"`
42-
TotalEstimatedCost float64 `json:"total_estimated_cost"`
43-
TotalDuration string `json:"total_duration"`
44-
RiskyNodeCount int `json:"risky_node_count"`
45-
ChangedNodeCount int `json:"changed_node_count"`
46-
WriteCapableNodeCount int `json:"write_capable_node_count"`
47-
MissingToolCount int `json:"missing_tool_count"`
48-
MCPFailureCount int `json:"mcp_failure_count"`
49-
BlockedRequestCount int `json:"blocked_request_count"`
50-
LatestSuccessFallbackCount int `json:"latest_success_fallback_count"`
51-
NewMCPFailureRunCount int `json:"new_mcp_failure_run_count"`
52-
BlockedRequestIncreaseRunCount int `json:"blocked_request_increase_run_count"`
53-
ResourceHeavyNodeCount int `json:"resource_heavy_node_count"`
54-
PoorControlNodeCount int `json:"poor_control_node_count"`
55-
RiskDistribution string `json:"risk_distribution"`
56-
EscalationEligible bool `json:"escalation_eligible"`
57-
EscalationReason string `json:"escalation_reason,omitempty"`
58-
SuggestedRoute string `json:"suggested_route,omitempty"`
59-
Repository string `json:"repository,omitempty"`
60-
Organization string `json:"organization,omitempty"`
43+
EpisodeID string `json:"episode_id"`
44+
Kind string `json:"kind"`
45+
Confidence string `json:"confidence"`
46+
Reasons []string `json:"reasons,omitempty"`
47+
RootRunID int64 `json:"root_run_id,omitempty"`
48+
RunIDs []int64 `json:"run_ids"`
49+
WorkflowNames []string `json:"workflow_names"`
50+
PrimaryWorkflow string `json:"primary_workflow,omitempty"`
51+
TotalRuns int `json:"total_runs"`
52+
TotalTokens int `json:"total_tokens"`
53+
TotalEstimatedCost float64 `json:"total_estimated_cost"`
54+
TotalDuration string `json:"total_duration"`
55+
RiskyNodeCount int `json:"risky_node_count"`
56+
ChangedNodeCount int `json:"changed_node_count"`
57+
WriteCapableNodeCount int `json:"write_capable_node_count"`
58+
MissingToolCount int `json:"missing_tool_count"`
59+
MCPFailureCount int `json:"mcp_failure_count"`
60+
BlockedRequestCount int `json:"blocked_request_count"`
61+
LatestSuccessFallbackCount int `json:"latest_success_fallback_count"`
62+
NewMCPFailureRunCount int `json:"new_mcp_failure_run_count"`
63+
BlockedRequestIncreaseRunCount int `json:"blocked_request_increase_run_count"`
64+
ResourceHeavyNodeCount int `json:"resource_heavy_node_count"`
65+
PoorControlNodeCount int `json:"poor_control_node_count"`
66+
RiskDistribution string `json:"risk_distribution"`
67+
EscalationEligible bool `json:"escalation_eligible"`
68+
EscalationReason string `json:"escalation_reason,omitempty"`
69+
SuggestedRoute string `json:"suggested_route,omitempty"`
70+
Repository string `json:"repository,omitempty"`
71+
Organization string `json:"organization,omitempty"`
72+
ToolCalls []EpisodeToolCall `json:"tool_calls,omitempty"`
6173
}
6274

6375
type episodeAccumulator struct {
64-
metadata EpisodeData
65-
duration time.Duration
66-
runSet map[int64]bool
67-
nameSet map[string]bool
68-
rootTime time.Time
76+
metadata EpisodeData
77+
duration time.Duration
78+
runSet map[int64]bool
79+
nameSet map[string]bool
80+
rootTime time.Time
81+
toolCalls []EpisodeToolCall
6982
}
7083

7184
type episodeSeed struct {
@@ -175,6 +188,12 @@ func buildEpisodeData(runs []RunData, processedRuns []ProcessedRun) ([]EpisodeDa
175188
if pr.FirewallAnalysis != nil {
176189
acc.metadata.BlockedRequestCount += pr.FirewallAnalysis.BlockedRequests
177190
}
191+
// Collect per-tool-call metrics for this run.
192+
if pr.MCPToolUsage != nil {
193+
for _, tc := range pr.MCPToolUsage.ToolCalls {
194+
acc.toolCalls = append(acc.toolCalls, mcpToolCallToEpisodeToolCall(tc))
195+
}
196+
}
178197
}
179198
if !run.CreatedAt.IsZero() && (acc.metadata.RootRunID == 0 || run.CreatedAt.Before(acc.rootTime)) {
180199
acc.rootTime = run.CreatedAt
@@ -226,6 +245,19 @@ func buildEpisodeData(runs []RunData, processedRuns []ProcessedRun) ([]EpisodeDa
226245
}
227246
acc.metadata.EscalationEligible, acc.metadata.EscalationReason = classifyEpisodeEscalation(acc.metadata)
228247
acc.metadata.SuggestedRoute = buildSuggestedRoute(acc.metadata)
248+
if len(acc.toolCalls) > 0 {
249+
// Sort tool calls for deterministic output (server, then tool name, then status).
250+
slices.SortFunc(acc.toolCalls, func(a, b EpisodeToolCall) int {
251+
if a.Server != b.Server {
252+
return cmp.Compare(a.Server, b.Server)
253+
}
254+
if a.Tool != b.Tool {
255+
return cmp.Compare(a.Tool, b.Tool)
256+
}
257+
return cmp.Compare(a.Status, b.Status)
258+
})
259+
acc.metadata.ToolCalls = acc.toolCalls
260+
}
229261
episodes = append(episodes, acc.metadata)
230262
}
231263

@@ -246,6 +278,26 @@ func buildEpisodeData(runs []RunData, processedRuns []ProcessedRun) ([]EpisodeDa
246278
return episodes, edges
247279
}
248280

281+
// mcpToolCallToEpisodeToolCall converts an MCPToolCall record to the lightweight
282+
// EpisodeToolCall format used in episode rollups.
283+
// Token count is estimated from input/output byte sizes using CharsPerToken.
284+
// Duration is converted from a formatted string to milliseconds.
285+
func mcpToolCallToEpisodeToolCall(tc MCPToolCall) EpisodeToolCall {
286+
tokens := (tc.InputSize + tc.OutputSize) / CharsPerToken
287+
var durationMS int64
288+
if tc.Duration != "" {
289+
durationMS = parseDurationString(tc.Duration).Milliseconds()
290+
}
291+
return EpisodeToolCall{
292+
Tool: tc.ToolName,
293+
Server: tc.ServerName,
294+
Tokens: tokens,
295+
DurationMS: durationMS,
296+
Status: tc.Status,
297+
Error: tc.Error,
298+
}
299+
}
300+
249301
func findEpisodeParent(parents map[int64]int64, runID int64) int64 {
250302
parent, exists := parents[runID]
251303
if !exists || parent == runID {

pkg/cli/logs_episode_test.go

Lines changed: 246 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,246 @@
1+
//go:build !integration
2+
3+
package cli
4+
5+
import (
6+
"testing"
7+
"time"
8+
9+
"github.com/stretchr/testify/assert"
10+
"github.com/stretchr/testify/require"
11+
)
12+
13+
func TestBuildEpisodeDataIncludesToolCalls(t *testing.T) {
14+
runs := []RunData{
15+
{
16+
DatabaseID: 101,
17+
WorkflowName: "my-workflow",
18+
Status: "completed",
19+
Conclusion: "success",
20+
TokenUsage: 1000,
21+
CreatedAt: time.Date(2024, 1, 1, 12, 0, 0, 0, time.UTC),
22+
},
23+
}
24+
processedRuns := []ProcessedRun{
25+
{
26+
Run: WorkflowRun{
27+
DatabaseID: 101,
28+
WorkflowName: "my-workflow",
29+
},
30+
MCPToolUsage: &MCPToolUsageData{
31+
ToolCalls: []MCPToolCall{
32+
{
33+
ServerName: "github",
34+
ToolName: "get_file_contents",
35+
InputSize: 400,
36+
OutputSize: 9200,
37+
Duration: "350ms",
38+
Status: "success",
39+
},
40+
{
41+
ServerName: "github",
42+
ToolName: "create_pull_request",
43+
InputSize: 200,
44+
OutputSize: 3000,
45+
Duration: "600ms",
46+
Status: "error",
47+
Error: "403 Resource not accessible by integration",
48+
},
49+
},
50+
},
51+
},
52+
}
53+
54+
episodes, _ := buildEpisodeData(runs, processedRuns)
55+
require.Len(t, episodes, 1, "expected one episode")
56+
57+
ep := episodes[0]
58+
require.Len(t, ep.ToolCalls, 2, "expected two tool calls in episode")
59+
60+
// Tool calls are sorted by server, then tool name. With server="github":
61+
// "create_pull_request" < "get_file_contents" alphabetically.
62+
63+
// First (alphabetically): create_pull_request — error call
64+
tc0 := ep.ToolCalls[0]
65+
assert.Equal(t, "create_pull_request", tc0.Tool, "tool name should match")
66+
assert.Equal(t, "github", tc0.Server, "server name should match")
67+
assert.Equal(t, (200+3000)/CharsPerToken, tc0.Tokens, "tokens should be estimated from sizes")
68+
assert.Equal(t, int64(600), tc0.DurationMS, "duration_ms should be 600")
69+
assert.Equal(t, "error", tc0.Status, "status should match")
70+
assert.Equal(t, "403 Resource not accessible by integration", tc0.Error, "error message should match")
71+
72+
// Second (alphabetically): get_file_contents — success call
73+
tc1 := ep.ToolCalls[1]
74+
assert.Equal(t, "get_file_contents", tc1.Tool, "tool name should match")
75+
assert.Equal(t, "github", tc1.Server, "server name should match")
76+
assert.Equal(t, (400+9200)/CharsPerToken, tc1.Tokens, "tokens should be estimated from sizes")
77+
assert.Equal(t, int64(350), tc1.DurationMS, "duration_ms should be 350")
78+
assert.Equal(t, "success", tc1.Status, "status should match")
79+
assert.Empty(t, tc1.Error, "no error expected")
80+
}
81+
82+
func TestBuildEpisodeDataNoToolCallsWhenMCPUsageAbsent(t *testing.T) {
83+
runs := []RunData{
84+
{
85+
DatabaseID: 200,
86+
WorkflowName: "no-mcp-workflow",
87+
Status: "completed",
88+
CreatedAt: time.Date(2024, 1, 1, 12, 0, 0, 0, time.UTC),
89+
},
90+
}
91+
processedRuns := []ProcessedRun{
92+
{
93+
Run: WorkflowRun{
94+
DatabaseID: 200,
95+
WorkflowName: "no-mcp-workflow",
96+
},
97+
MCPToolUsage: nil, // no MCP tool usage
98+
},
99+
}
100+
101+
episodes, _ := buildEpisodeData(runs, processedRuns)
102+
require.Len(t, episodes, 1, "expected one episode")
103+
104+
ep := episodes[0]
105+
assert.Empty(t, ep.ToolCalls, "tool_calls should be absent when no MCP usage data")
106+
}
107+
108+
func TestBuildEpisodeDataAggregatesToolCallsAcrossRuns(t *testing.T) {
109+
// Two runs belonging to the same episode (via dispatch)
110+
workflowCallID := "dispatch:wc-42"
111+
runs := []RunData{
112+
{
113+
DatabaseID: 301,
114+
WorkflowName: "orchestrator",
115+
Status: "completed",
116+
CreatedAt: time.Date(2024, 1, 1, 12, 0, 0, 0, time.UTC),
117+
AwContext: &AwContext{
118+
WorkflowCallID: "wc-42",
119+
},
120+
},
121+
{
122+
DatabaseID: 302,
123+
WorkflowName: "worker",
124+
Status: "completed",
125+
CreatedAt: time.Date(2024, 1, 1, 12, 1, 0, 0, time.UTC),
126+
AwContext: &AwContext{
127+
WorkflowCallID: "wc-42",
128+
},
129+
},
130+
}
131+
processedRuns := []ProcessedRun{
132+
{
133+
Run: WorkflowRun{DatabaseID: 301, WorkflowName: "orchestrator"},
134+
MCPToolUsage: &MCPToolUsageData{
135+
ToolCalls: []MCPToolCall{
136+
{
137+
ServerName: "github",
138+
ToolName: "search_code",
139+
InputSize: 100,
140+
OutputSize: 500,
141+
Duration: "200ms",
142+
Status: "success",
143+
},
144+
},
145+
},
146+
},
147+
{
148+
Run: WorkflowRun{DatabaseID: 302, WorkflowName: "worker"},
149+
MCPToolUsage: &MCPToolUsageData{
150+
ToolCalls: []MCPToolCall{
151+
{
152+
ServerName: "github",
153+
ToolName: "create_issue",
154+
InputSize: 50,
155+
OutputSize: 200,
156+
Duration: "400ms",
157+
Status: "success",
158+
},
159+
},
160+
},
161+
},
162+
}
163+
164+
episodes, _ := buildEpisodeData(runs, processedRuns)
165+
require.Len(t, episodes, 1, "expected one merged episode from two dispatch runs")
166+
167+
ep := episodes[0]
168+
assert.Equal(t, workflowCallID, ep.EpisodeID, "episode id should reflect dispatch call id")
169+
assert.Len(t, ep.ToolCalls, 2, "tool_calls should include calls from both runs")
170+
}
171+
172+
func TestMCPToolCallToEpisodeToolCall(t *testing.T) {
173+
tests := []struct {
174+
name string
175+
input MCPToolCall
176+
expectedTool string
177+
expectedServer string
178+
expectedTokens int
179+
expectedDurMS int64
180+
expectedStatus string
181+
expectedError string
182+
}{
183+
{
184+
name: "success call with duration",
185+
input: MCPToolCall{
186+
ServerName: "github",
187+
ToolName: "list_issues",
188+
InputSize: 400,
189+
OutputSize: 1200,
190+
Duration: "250ms",
191+
Status: "success",
192+
},
193+
expectedTool: "list_issues",
194+
expectedServer: "github",
195+
expectedTokens: (400 + 1200) / CharsPerToken,
196+
expectedDurMS: 250,
197+
expectedStatus: "success",
198+
},
199+
{
200+
name: "error call with error message",
201+
input: MCPToolCall{
202+
ServerName: "playwright",
203+
ToolName: "navigate",
204+
InputSize: 100,
205+
OutputSize: 0,
206+
Duration: "1s",
207+
Status: "error",
208+
Error: "timeout",
209+
},
210+
expectedTool: "navigate",
211+
expectedServer: "playwright",
212+
expectedTokens: 100 / CharsPerToken,
213+
expectedDurMS: 1000,
214+
expectedStatus: "error",
215+
expectedError: "timeout",
216+
},
217+
{
218+
name: "call without duration",
219+
input: MCPToolCall{
220+
ServerName: "github",
221+
ToolName: "get_repo",
222+
InputSize: 200,
223+
OutputSize: 800,
224+
Duration: "",
225+
Status: "success",
226+
},
227+
expectedTool: "get_repo",
228+
expectedServer: "github",
229+
expectedTokens: (200 + 800) / CharsPerToken,
230+
expectedDurMS: 0,
231+
expectedStatus: "success",
232+
},
233+
}
234+
235+
for _, tt := range tests {
236+
t.Run(tt.name, func(t *testing.T) {
237+
got := mcpToolCallToEpisodeToolCall(tt.input)
238+
assert.Equal(t, tt.expectedTool, got.Tool, "Tool should match")
239+
assert.Equal(t, tt.expectedServer, got.Server, "Server should match")
240+
assert.Equal(t, tt.expectedTokens, got.Tokens, "Tokens should be estimated from sizes")
241+
assert.Equal(t, tt.expectedDurMS, got.DurationMS, "DurationMS should match")
242+
assert.Equal(t, tt.expectedStatus, got.Status, "Status should match")
243+
assert.Equal(t, tt.expectedError, got.Error, "Error should match")
244+
})
245+
}
246+
}

0 commit comments

Comments
 (0)