feat(mcp): Add per-tool-call metrics to logs episode response (#24389)

Copilot · web-flow · commit 2dfabd658a3d · 2026-04-03T16:04:54.000-07:00
diff --git a/pkg/cli/logs_episode.go b/pkg/cli/logs_episode.go
@@ -27,45 +27,58 @@ type EpisodeEdge struct {
 	EpisodeID   string   `json:"episode_id,omitempty"`
 }
 
+// EpisodeToolCall represents a single MCP tool call within an episode.
+// It provides per-call observability for token consumption, latency, and error details.
+type EpisodeToolCall struct {
+	Tool       string `json:"tool"`
+	Server     string `json:"server"`
+	Tokens     int    `json:"tokens"`
+	DurationMS int64  `json:"duration_ms"`
+	Status     string `json:"status"`
+	Error      string `json:"error,omitempty"`
+}
+
 // EpisodeData represents a deterministic episode rollup derived from workflow runs.
 type EpisodeData struct {
-	EpisodeID                      string   `json:"episode_id"`
-	Kind                           string   `json:"kind"`
-	Confidence                     string   `json:"confidence"`
-	Reasons                        []string `json:"reasons,omitempty"`
-	RootRunID                      int64    `json:"root_run_id,omitempty"`
-	RunIDs                         []int64  `json:"run_ids"`
-	WorkflowNames                  []string `json:"workflow_names"`
-	PrimaryWorkflow                string   `json:"primary_workflow,omitempty"`
-	TotalRuns                      int      `json:"total_runs"`
-	TotalTokens                    int      `json:"total_tokens"`
-	TotalEstimatedCost             float64  `json:"total_estimated_cost"`
-	TotalDuration                  string   `json:"total_duration"`
-	RiskyNodeCount                 int      `json:"risky_node_count"`
-	ChangedNodeCount               int      `json:"changed_node_count"`
-	WriteCapableNodeCount          int      `json:"write_capable_node_count"`
-	MissingToolCount               int      `json:"missing_tool_count"`
-	MCPFailureCount                int      `json:"mcp_failure_count"`
-	BlockedRequestCount            int      `json:"blocked_request_count"`
-	LatestSuccessFallbackCount     int      `json:"latest_success_fallback_count"`
-	NewMCPFailureRunCount          int      `json:"new_mcp_failure_run_count"`
-	BlockedRequestIncreaseRunCount int      `json:"blocked_request_increase_run_count"`
-	ResourceHeavyNodeCount         int      `json:"resource_heavy_node_count"`
-	PoorControlNodeCount           int      `json:"poor_control_node_count"`
-	RiskDistribution               string   `json:"risk_distribution"`
-	EscalationEligible             bool     `json:"escalation_eligible"`
-	EscalationReason               string   `json:"escalation_reason,omitempty"`
-	SuggestedRoute                 string   `json:"suggested_route,omitempty"`
-	Repository                     string   `json:"repository,omitempty"`
-	Organization                   string   `json:"organization,omitempty"`
+	EpisodeID                      string            `json:"episode_id"`
+	Kind                           string            `json:"kind"`
+	Confidence                     string            `json:"confidence"`
+	Reasons                        []string          `json:"reasons,omitempty"`
+	RootRunID                      int64             `json:"root_run_id,omitempty"`
+	RunIDs                         []int64           `json:"run_ids"`
+	WorkflowNames                  []string          `json:"workflow_names"`
+	PrimaryWorkflow                string            `json:"primary_workflow,omitempty"`
+	TotalRuns                      int               `json:"total_runs"`
+	TotalTokens                    int               `json:"total_tokens"`
+	TotalEstimatedCost             float64           `json:"total_estimated_cost"`
+	TotalDuration                  string            `json:"total_duration"`
+	RiskyNodeCount                 int               `json:"risky_node_count"`
+	ChangedNodeCount               int               `json:"changed_node_count"`
+	WriteCapableNodeCount          int               `json:"write_capable_node_count"`
+	MissingToolCount               int               `json:"missing_tool_count"`
+	MCPFailureCount                int               `json:"mcp_failure_count"`
+	BlockedRequestCount            int               `json:"blocked_request_count"`
+	LatestSuccessFallbackCount     int               `json:"latest_success_fallback_count"`
+	NewMCPFailureRunCount          int               `json:"new_mcp_failure_run_count"`
+	BlockedRequestIncreaseRunCount int               `json:"blocked_request_increase_run_count"`
+	ResourceHeavyNodeCount         int               `json:"resource_heavy_node_count"`
+	PoorControlNodeCount           int               `json:"poor_control_node_count"`
+	RiskDistribution               string            `json:"risk_distribution"`
+	EscalationEligible             bool              `json:"escalation_eligible"`
+	EscalationReason               string            `json:"escalation_reason,omitempty"`
+	SuggestedRoute                 string            `json:"suggested_route,omitempty"`
+	Repository                     string            `json:"repository,omitempty"`
+	Organization                   string            `json:"organization,omitempty"`
+	ToolCalls                      []EpisodeToolCall `json:"tool_calls,omitempty"`
 }
 
 type episodeAccumulator struct {
-	metadata EpisodeData
-	duration time.Duration
-	runSet   map[int64]bool
-	nameSet  map[string]bool
-	rootTime time.Time
+	metadata  EpisodeData
+	duration  time.Duration
+	runSet    map[int64]bool
+	nameSet   map[string]bool
+	rootTime  time.Time
+	toolCalls []EpisodeToolCall
 }
 
 type episodeSeed struct {
@@ -175,6 +188,12 @@ func buildEpisodeData(runs []RunData, processedRuns []ProcessedRun) ([]EpisodeDa
 			if pr.FirewallAnalysis != nil {
 				acc.metadata.BlockedRequestCount += pr.FirewallAnalysis.BlockedRequests
 			}
+			// Collect per-tool-call metrics for this run.
+			if pr.MCPToolUsage != nil {
+				for _, tc := range pr.MCPToolUsage.ToolCalls {
+					acc.toolCalls = append(acc.toolCalls, mcpToolCallToEpisodeToolCall(tc))
+				}
+			}
 		}
 		if !run.CreatedAt.IsZero() && (acc.metadata.RootRunID == 0 || run.CreatedAt.Before(acc.rootTime)) {
 			acc.rootTime = run.CreatedAt
@@ -226,6 +245,19 @@ func buildEpisodeData(runs []RunData, processedRuns []ProcessedRun) ([]EpisodeDa
 		}
 		acc.metadata.EscalationEligible, acc.metadata.EscalationReason = classifyEpisodeEscalation(acc.metadata)
 		acc.metadata.SuggestedRoute = buildSuggestedRoute(acc.metadata)
+		if len(acc.toolCalls) > 0 {
+			// Sort tool calls for deterministic output (server, then tool name, then status).
+			slices.SortFunc(acc.toolCalls, func(a, b EpisodeToolCall) int {
+				if a.Server != b.Server {
+					return cmp.Compare(a.Server, b.Server)
+				}
+				if a.Tool != b.Tool {
+					return cmp.Compare(a.Tool, b.Tool)
+				}
+				return cmp.Compare(a.Status, b.Status)
+			})
+			acc.metadata.ToolCalls = acc.toolCalls
+		}
 		episodes = append(episodes, acc.metadata)
 	}
 
@@ -246,6 +278,26 @@ func buildEpisodeData(runs []RunData, processedRuns []ProcessedRun) ([]EpisodeDa
 	return episodes, edges
 }
 
+// mcpToolCallToEpisodeToolCall converts an MCPToolCall record to the lightweight
+// EpisodeToolCall format used in episode rollups.
+// Token count is estimated from input/output byte sizes using CharsPerToken.
+// Duration is converted from a formatted string to milliseconds.
+func mcpToolCallToEpisodeToolCall(tc MCPToolCall) EpisodeToolCall {
+	tokens := (tc.InputSize + tc.OutputSize) / CharsPerToken
+	var durationMS int64
+	if tc.Duration != "" {
+		durationMS = parseDurationString(tc.Duration).Milliseconds()
+	}
+	return EpisodeToolCall{
+		Tool:       tc.ToolName,
+		Server:     tc.ServerName,
+		Tokens:     tokens,
+		DurationMS: durationMS,
+		Status:     tc.Status,
+		Error:      tc.Error,
+	}
+}
+
 func findEpisodeParent(parents map[int64]int64, runID int64) int64 {
 	parent, exists := parents[runID]
 	if !exists || parent == runID {
diff --git a/pkg/cli/logs_episode_test.go b/pkg/cli/logs_episode_test.go
@@ -0,0 +1,246 @@
+//go:build !integration
+
+package cli
+
+import (
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+func TestBuildEpisodeDataIncludesToolCalls(t *testing.T) {
+	runs := []RunData{
+		{
+			DatabaseID:   101,
+			WorkflowName: "my-workflow",
+			Status:       "completed",
+			Conclusion:   "success",
+			TokenUsage:   1000,
+			CreatedAt:    time.Date(2024, 1, 1, 12, 0, 0, 0, time.UTC),
+		},
+	}
+	processedRuns := []ProcessedRun{
+		{
+			Run: WorkflowRun{
+				DatabaseID:   101,
+				WorkflowName: "my-workflow",
+			},
+			MCPToolUsage: &MCPToolUsageData{
+				ToolCalls: []MCPToolCall{
+					{
+						ServerName: "github",
+						ToolName:   "get_file_contents",
+						InputSize:  400,
+						OutputSize: 9200,
+						Duration:   "350ms",
+						Status:     "success",
+					},
+					{
+						ServerName: "github",
+						ToolName:   "create_pull_request",
+						InputSize:  200,
+						OutputSize: 3000,
+						Duration:   "600ms",
+						Status:     "error",
+						Error:      "403 Resource not accessible by integration",
+					},
+				},
+			},
+		},
+	}
+
+	episodes, _ := buildEpisodeData(runs, processedRuns)
+	require.Len(t, episodes, 1, "expected one episode")
+
+	ep := episodes[0]
+	require.Len(t, ep.ToolCalls, 2, "expected two tool calls in episode")
+
+	// Tool calls are sorted by server, then tool name. With server="github":
+	// "create_pull_request" < "get_file_contents" alphabetically.
+
+	// First (alphabetically): create_pull_request — error call
+	tc0 := ep.ToolCalls[0]
+	assert.Equal(t, "create_pull_request", tc0.Tool, "tool name should match")
+	assert.Equal(t, "github", tc0.Server, "server name should match")
+	assert.Equal(t, (200+3000)/CharsPerToken, tc0.Tokens, "tokens should be estimated from sizes")
+	assert.Equal(t, int64(600), tc0.DurationMS, "duration_ms should be 600")
+	assert.Equal(t, "error", tc0.Status, "status should match")
+	assert.Equal(t, "403 Resource not accessible by integration", tc0.Error, "error message should match")
+
+	// Second (alphabetically): get_file_contents — success call
+	tc1 := ep.ToolCalls[1]
+	assert.Equal(t, "get_file_contents", tc1.Tool, "tool name should match")
+	assert.Equal(t, "github", tc1.Server, "server name should match")
+	assert.Equal(t, (400+9200)/CharsPerToken, tc1.Tokens, "tokens should be estimated from sizes")
+	assert.Equal(t, int64(350), tc1.DurationMS, "duration_ms should be 350")
+	assert.Equal(t, "success", tc1.Status, "status should match")
+	assert.Empty(t, tc1.Error, "no error expected")
+}
+
+func TestBuildEpisodeDataNoToolCallsWhenMCPUsageAbsent(t *testing.T) {
+	runs := []RunData{
+		{
+			DatabaseID:   200,
+			WorkflowName: "no-mcp-workflow",
+			Status:       "completed",
+			CreatedAt:    time.Date(2024, 1, 1, 12, 0, 0, 0, time.UTC),
+		},
+	}
+	processedRuns := []ProcessedRun{
+		{
+			Run: WorkflowRun{
+				DatabaseID:   200,
+				WorkflowName: "no-mcp-workflow",
+			},
+			MCPToolUsage: nil, // no MCP tool usage
+		},
+	}
+
+	episodes, _ := buildEpisodeData(runs, processedRuns)
+	require.Len(t, episodes, 1, "expected one episode")
+
+	ep := episodes[0]
+	assert.Empty(t, ep.ToolCalls, "tool_calls should be absent when no MCP usage data")
+}
+
+func TestBuildEpisodeDataAggregatesToolCallsAcrossRuns(t *testing.T) {
+	// Two runs belonging to the same episode (via dispatch)
+	workflowCallID := "dispatch:wc-42"
+	runs := []RunData{
+		{
+			DatabaseID:   301,
+			WorkflowName: "orchestrator",
+			Status:       "completed",
+			CreatedAt:    time.Date(2024, 1, 1, 12, 0, 0, 0, time.UTC),
+			AwContext: &AwContext{
+				WorkflowCallID: "wc-42",
+			},
+		},
+		{
+			DatabaseID:   302,
+			WorkflowName: "worker",
+			Status:       "completed",
+			CreatedAt:    time.Date(2024, 1, 1, 12, 1, 0, 0, time.UTC),
+			AwContext: &AwContext{
+				WorkflowCallID: "wc-42",
+			},
+		},
+	}
+	processedRuns := []ProcessedRun{
+		{
+			Run: WorkflowRun{DatabaseID: 301, WorkflowName: "orchestrator"},
+			MCPToolUsage: &MCPToolUsageData{
+				ToolCalls: []MCPToolCall{
+					{
+						ServerName: "github",
+						ToolName:   "search_code",
+						InputSize:  100,
+						OutputSize: 500,
+						Duration:   "200ms",
+						Status:     "success",
+					},
+				},
+			},
+		},
+		{
+			Run: WorkflowRun{DatabaseID: 302, WorkflowName: "worker"},
+			MCPToolUsage: &MCPToolUsageData{
+				ToolCalls: []MCPToolCall{
+					{
+						ServerName: "github",
+						ToolName:   "create_issue",
+						InputSize:  50,
+						OutputSize: 200,
+						Duration:   "400ms",
+						Status:     "success",
+					},
+				},
+			},
+		},
+	}
+
+	episodes, _ := buildEpisodeData(runs, processedRuns)
+	require.Len(t, episodes, 1, "expected one merged episode from two dispatch runs")
+
+	ep := episodes[0]
+	assert.Equal(t, workflowCallID, ep.EpisodeID, "episode id should reflect dispatch call id")
+	assert.Len(t, ep.ToolCalls, 2, "tool_calls should include calls from both runs")
+}
+
+func TestMCPToolCallToEpisodeToolCall(t *testing.T) {
+	tests := []struct {
+		name           string
+		input          MCPToolCall
+		expectedTool   string
+		expectedServer string
+		expectedTokens int
+		expectedDurMS  int64
+		expectedStatus string
+		expectedError  string
+	}{
+		{
+			name: "success call with duration",
+			input: MCPToolCall{
+				ServerName: "github",
+				ToolName:   "list_issues",
+				InputSize:  400,
+				OutputSize: 1200,
+				Duration:   "250ms",
+				Status:     "success",
+			},
+			expectedTool:   "list_issues",
+			expectedServer: "github",
+			expectedTokens: (400 + 1200) / CharsPerToken,
+			expectedDurMS:  250,
+			expectedStatus: "success",
+		},
+		{
+			name: "error call with error message",
+			input: MCPToolCall{
+				ServerName: "playwright",
+				ToolName:   "navigate",
+				InputSize:  100,
+				OutputSize: 0,
+				Duration:   "1s",
+				Status:     "error",
+				Error:      "timeout",
+			},
+			expectedTool:   "navigate",
+			expectedServer: "playwright",
+			expectedTokens: 100 / CharsPerToken,
+			expectedDurMS:  1000,
+			expectedStatus: "error",
+			expectedError:  "timeout",
+		},
+		{
+			name: "call without duration",
+			input: MCPToolCall{
+				ServerName: "github",
+				ToolName:   "get_repo",
+				InputSize:  200,
+				OutputSize: 800,
+				Duration:   "",
+				Status:     "success",
+			},
+			expectedTool:   "get_repo",
+			expectedServer: "github",
+			expectedTokens: (200 + 800) / CharsPerToken,
+			expectedDurMS:  0,
+			expectedStatus: "success",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got := mcpToolCallToEpisodeToolCall(tt.input)
+			assert.Equal(t, tt.expectedTool, got.Tool, "Tool should match")
+			assert.Equal(t, tt.expectedServer, got.Server, "Server should match")
+			assert.Equal(t, tt.expectedTokens, got.Tokens, "Tokens should be estimated from sizes")
+			assert.Equal(t, tt.expectedDurMS, got.DurationMS, "DurationMS should match")
+			assert.Equal(t, tt.expectedStatus, got.Status, "Status should match")
+			assert.Equal(t, tt.expectedError, got.Error, "Error should match")
+		})
+	}
+}