github · pelikhan · Apr 4, 2026 · Apr 4, 2026 · Apr 4, 2026 · Apr 4, 2026
diff --git a/pkg/cli/audit_diff.go b/pkg/cli/audit_diff.go
@@ -260,17 +260,44 @@ type MCPToolsDiffSummary struct {
 	AnomalyCount     int  `json:"anomaly_count"`
 }
 
+// TokenUsageDiff represents the detailed diff of token usage between two runs, based on the firewall proxy token-usage.jsonl data
+// from RunSummary.TokenUsage. computeTokenUsageDiff leaves a *Change/*Delta string empty unless at least one run's value is positive.
+type TokenUsageDiff struct {
+	Run1InputTokens        int     `json:"run1_input_tokens"`
+	Run2InputTokens        int     `json:"run2_input_tokens"`
+	InputTokensChange      string  `json:"input_tokens_change,omitempty"`
+	Run1OutputTokens       int     `json:"run1_output_tokens"`
+	Run2OutputTokens       int     `json:"run2_output_tokens"`
+	OutputTokensChange     string  `json:"output_tokens_change,omitempty"`
+	Run1CacheReadTokens    int     `json:"run1_cache_read_tokens"`
+	Run2CacheReadTokens    int     `json:"run2_cache_read_tokens"`
+	CacheReadTokensChange  string  `json:"cache_read_tokens_change,omitempty"`
+	Run1CacheWriteTokens   int     `json:"run1_cache_write_tokens"`
+	Run2CacheWriteTokens   int     `json:"run2_cache_write_tokens"`
+	CacheWriteTokensChange string  `json:"cache_write_tokens_change,omitempty"`
+	Run1EffectiveTokens    int     `json:"run1_effective_tokens"`
+	Run2EffectiveTokens    int     `json:"run2_effective_tokens"`
+	EffectiveTokensChange  string  `json:"effective_tokens_change,omitempty"`
+	Run1TotalRequests      int     `json:"run1_total_requests"`
+	Run2TotalRequests      int     `json:"run2_total_requests"`
+	RequestsDelta          string  `json:"requests_delta,omitempty"` // Absolute request-count delta, e.g. "+4"
+	Run1CacheEfficiency    float64 `json:"run1_cache_efficiency"`
+	Run2CacheEfficiency    float64 `json:"run2_cache_efficiency"`
+	CacheEfficiencyChange  string  `json:"cache_efficiency_change,omitempty"` // Percentage-point delta, e.g. "+1.5pp"
+}
+
 // RunMetricsDiff represents the diff of run-level metrics (token usage, duration, turns) between two runs
 type RunMetricsDiff struct {
-	Run1TokenUsage   int    `json:"run1_token_usage"`
-	Run2TokenUsage   int    `json:"run2_token_usage"`
-	TokenUsageChange string `json:"token_usage_change,omitempty"` // e.g. "+15%", "-5%"
-	Run1Duration     string `json:"run1_duration,omitempty"`
-	Run2Duration     string `json:"run2_duration,omitempty"`
-	DurationChange   string `json:"duration_change,omitempty"` // e.g. "+2m30s", "-1m"
-	Run1Turns        int    `json:"run1_turns,omitempty"`
-	Run2Turns        int    `json:"run2_turns,omitempty"`
-	TurnsChange      int    `json:"turns_change,omitempty"`
+	Run1TokenUsage    int             `json:"run1_token_usage"`
+	Run2TokenUsage    int             `json:"run2_token_usage"`
+	TokenUsageChange  string          `json:"token_usage_change,omitempty"` // e.g. "+15%", "-5%"
+	Run1Duration      string          `json:"run1_duration,omitempty"`
+	Run2Duration      string          `json:"run2_duration,omitempty"`
+	DurationChange    string          `json:"duration_change,omitempty"` // e.g. "+2m30s", "-1m"
+	Run1Turns         int             `json:"run1_turns,omitempty"`
+	Run2Turns         int             `json:"run2_turns,omitempty"`
+	TurnsChange       int             `json:"turns_change,omitempty"`
+	TokenUsageDetails *TokenUsageDiff `json:"token_usage_details,omitempty"` // Detailed breakdown from firewall proxy; nil when neither run has a TokenUsage summary
 }
 
 // AuditDiff is the top-level diff combining firewall behavior, MCP tool invocations,
@@ -429,20 +456,24 @@ func computeRunMetricsDiff(summary1, summary2 *RunSummary) *RunMetricsDiff {
 	var run1Tokens, run2Tokens int
 	var run1Duration, run2Duration time.Duration
 	var run1Turns, run2Turns int
+	var tu1, tu2 *TokenUsageSummary
 
 	if summary1 != nil {
 		run1Tokens = summary1.Run.TokenUsage
 		run1Duration = summary1.Run.Duration
 		run1Turns = summary1.Run.Turns
+		tu1 = summary1.TokenUsage
 	}
 	if summary2 != nil {
 		run2Tokens = summary2.Run.TokenUsage
 		run2Duration = summary2.Run.Duration
 		run2Turns = summary2.Run.Turns
+		tu2 = summary2.TokenUsage
 	}
 
 	// Skip if there is no meaningful data
-	if run1Tokens == 0 && run2Tokens == 0 && run1Duration == 0 && run2Duration == 0 && run1Turns == 0 && run2Turns == 0 {
+	hasTokenDetails := tu1 != nil || tu2 != nil
+	if run1Tokens == 0 && run2Tokens == 0 && run1Duration == 0 && run2Duration == 0 && run1Turns == 0 && run2Turns == 0 && !hasTokenDetails {
 		return nil
 	}
 
@@ -473,9 +504,100 @@ func computeRunMetricsDiff(summary1, summary2 *RunSummary) *RunMetricsDiff {
 		}
 	}
 
+	diff.TokenUsageDetails = computeTokenUsageDiff(tu1, tu2)
+
 	return diff
 }
 
+// computeTokenUsageDiff builds the per-metric token usage comparison for two runs from
+// their firewall proxy token-usage.jsonl summaries (TokenUsageSummary).
+//
+// tu1 is the summary for the first run and tu2 for the second; either may be nil, in
+// which case that run contributes zero for every metric. When both are nil there is
+// nothing to compare and the function returns nil.
+//
+// The change strings are filled in per metric, and only when at least one run has a
+// value above zero:
+//   - token counts use formatVolumeChange
+//   - the request count uses formatCountChange (absolute delta)
+//   - cache efficiency uses formatPercentagePointChange (percentage points)
+func computeTokenUsageDiff(tu1, tu2 *TokenUsageSummary) *TokenUsageDiff {
+	if tu1 == nil && tu2 == nil {
+		return nil
+	}
+
+	// A missing run is read through a zero-valued summary, so both sides can be
+	// accessed the same way without further nil checks.
+	var missing TokenUsageSummary
+	base, other := &missing, &missing
+	if tu1 != nil {
+		base = tu1
+	}
+	if tu2 != nil {
+		other = tu2
+	}
+
+	// volumeChange formats the change between two token counts, or yields "" when
+	// neither count is positive so the field is omitted from the JSON output.
+	volumeChange := func(before, after int) string {
+		if before <= 0 && after <= 0 {
+			return ""
+		}
+		return formatVolumeChange(before, after)
+	}
+
+	// Token counts get their change strings inline; requests and cache efficiency follow.
+	result := &TokenUsageDiff{
+		Run1InputTokens:   base.TotalInputTokens,
+		Run2InputTokens:   other.TotalInputTokens,
+		InputTokensChange: volumeChange(base.TotalInputTokens, other.TotalInputTokens),
+
+		Run1OutputTokens:   base.TotalOutputTokens,
+		Run2OutputTokens:   other.TotalOutputTokens,
+		OutputTokensChange: volumeChange(base.TotalOutputTokens, other.TotalOutputTokens),
+
+		Run1CacheReadTokens:   base.TotalCacheReadTokens,
+		Run2CacheReadTokens:   other.TotalCacheReadTokens,
+		CacheReadTokensChange: volumeChange(base.TotalCacheReadTokens, other.TotalCacheReadTokens),
+
+		Run1CacheWriteTokens:   base.TotalCacheWriteTokens,
+		Run2CacheWriteTokens:   other.TotalCacheWriteTokens,
+		CacheWriteTokensChange: volumeChange(base.TotalCacheWriteTokens, other.TotalCacheWriteTokens),
+
+		Run1EffectiveTokens:   base.TotalEffectiveTokens,
+		Run2EffectiveTokens:   other.TotalEffectiveTokens,
+		EffectiveTokensChange: volumeChange(base.TotalEffectiveTokens, other.TotalEffectiveTokens),
+
+		Run1TotalRequests: base.TotalRequests,
+		Run2TotalRequests: other.TotalRequests,
+
+		Run1CacheEfficiency: base.CacheEfficiency,
+		Run2CacheEfficiency: other.CacheEfficiency,
+	}
+
+	// Request counts are reported as an absolute delta.
+	if base.TotalRequests > 0 || other.TotalRequests > 0 {
+		result.RequestsDelta = formatCountChange(base.TotalRequests, other.TotalRequests)
+	}
+
+	// Cache efficiency is reported as a percentage-point delta.
+	if base.CacheEfficiency > 0 || other.CacheEfficiency > 0 {
+		result.CacheEfficiencyChange = formatPercentagePointChange(base.CacheEfficiency, other.CacheEfficiency)
+	}
+
+	return result
+}
+
+// formatPercentagePointChange formats the change between two ratio values (0.0-1.0) as a
+// percentage-point delta (e.g. "+1.5pp", "-2.3pp"); a delta that rounds to zero is always "+0.0pp"
+func formatPercentagePointChange(ratio1, ratio2 float64) string {
+	delta := (ratio2 - ratio1) * 100
+	if delta <= 0 && delta > -0.05 {
+		delta = 0 // would otherwise print as "-0.0pp" even though it rounds to no change
+	}
+	return fmt.Sprintf("%+.1fpp", delta)
+}
+
 // formatCountChange formats the absolute change in a count value (e.g. "+3", "-1")
 func formatCountChange(count1, count2 int) string {
 	delta := count2 - count1

diff --git a/pkg/cli/audit_diff_command.go b/pkg/cli/audit_diff_command.go
@@ -15,35 +15,54 @@ import (
 // NewAuditDiffSubcommand creates the audit diff subcommand
 func NewAuditDiffSubcommand() *cobra.Command {
 	cmd := &cobra.Command{
-		Use:   "diff <run-id-1> <run-id-2>",
-		Short: "Compare behavior across two workflow runs",
-		Long: `Compare workflow run behavior between two workflow runs to detect policy regressions,
-new unauthorized domains, behavioral drift, and changes in MCP tool usage or run metrics.
+		Use:   "diff <base-run-id> <compare-run-id>...",
+		Short: "Compare behavior across workflow runs",
+		Long: `Compare workflow run behavior between a base run and one or more comparison runs
+to detect policy regressions, new unauthorized domains, behavioral drift, and changes in
+MCP tool usage, token usage, or run metrics.
 
-This command downloads artifacts for both runs (using cached data when available),
+The first argument is the base (reference) run. All subsequent arguments are compared
+against that base. This enables tracking behavioral drift across multiple runs at once.
+
+This command downloads artifacts for all runs (using cached data when available),
 analyzes their data, and produces a diff showing:
-- New domains that appeared in the second run
-- Removed domains that were in the first run but not the second
+- New domains that appeared in the comparison run
+- Removed domains that were in the base run but not the comparison
 - Status changes (domains that flipped between allowed and denied)
 - Volume changes (significant request count changes, >100% threshold)
 - Anomaly flags (new denied domains, previously-denied now allowed)
 - MCP tool invocation changes (new/removed tools, call count and error count diffs)
 - Run metrics comparison (token usage, duration, turns) when cached data is available
+- Detailed token usage breakdown (input/output/cache/effective tokens) from firewall proxy
 
 Examples:
-  ` + string(constants.CLIExtensionPrefix) + ` audit diff 12345 12346                     # Compare two runs
-  ` + string(constants.CLIExtensionPrefix) + ` audit diff 12345 12346 --format markdown    # Markdown output for PR comments
-  ` + string(constants.CLIExtensionPrefix) + ` audit diff 12345 12346 --json               # JSON for CI integration
-  ` + string(constants.CLIExtensionPrefix) + ` audit diff 12345 12346 --repo owner/repo    # Specify repository`,
-		Args: cobra.ExactArgs(2),
+  ` + string(constants.CLIExtensionPrefix) + ` audit diff 12345 12346                               # Compare two runs
+  ` + string(constants.CLIExtensionPrefix) + ` audit diff 12345 12346 12347 12348                   # Compare base against 3 runs
+  ` + string(constants.CLIExtensionPrefix) + ` audit diff 12345 12346 --format markdown             # Markdown output for PR comments
+  ` + string(constants.CLIExtensionPrefix) + ` audit diff 12345 12346 --json                        # JSON for CI integration
+  ` + string(constants.CLIExtensionPrefix) + ` audit diff 12345 12346 --repo owner/repo             # Specify repository`,
+		Args: cobra.MinimumNArgs(2),
 		RunE: func(cmd *cobra.Command, args []string) error {
-			runID1, err := strconv.ParseInt(args[0], 10, 64)
+			baseRunID, err := strconv.ParseInt(args[0], 10, 64)
 			if err != nil {
-				return fmt.Errorf("invalid run ID %q: must be a numeric run ID", args[0])
+				return fmt.Errorf("invalid base run ID %q: must be a numeric run ID", args[0])
 			}
-			runID2, err := strconv.ParseInt(args[1], 10, 64)
-			if err != nil {
-				return fmt.Errorf("invalid run ID %q: must be a numeric run ID", args[1])
+
+			compareRunIDs := make([]int64, 0, len(args)-1)
+			seen := make(map[int64]bool)
+			for _, arg := range args[1:] {
+				id, err := strconv.ParseInt(arg, 10, 64)
+				if err != nil {
+					return fmt.Errorf("invalid run ID %q: must be a numeric run ID", arg)
+				}
+				if id == baseRunID {
+					return fmt.Errorf("comparison run ID %d is the same as the base run ID: cannot diff a run against itself", id)
+				}
+				if seen[id] {
+					return fmt.Errorf("duplicate comparison run ID %d: each run ID must appear only once", id)
+				}
+				seen[id] = true
+				compareRunIDs = append(compareRunIDs, id)
 			}
 
 			outputDir, _ := cmd.Flags().GetString("output")
@@ -62,7 +81,7 @@ Examples:
 				repo = parts[1]
 			}
 
-			return RunAuditDiff(cmd.Context(), runID1, runID2, owner, repo, hostname, outputDir, verbose, jsonOutput, format)
+			return RunAuditDiff(cmd.Context(), baseRunID, compareRunIDs, owner, repo, hostname, outputDir, verbose, jsonOutput, format)
 		},
 	}
 
@@ -74,9 +93,10 @@ Examples:
 	return cmd
 }
 
-// RunAuditDiff compares behavior between two workflow runs
-func RunAuditDiff(ctx context.Context, runID1, runID2 int64, owner, repo, hostname, outputDir string, verbose, jsonOutput bool, format string) error {
-	auditDiffLog.Printf("Starting audit diff: run1=%d, run2=%d", runID1, runID2)
+// RunAuditDiff compares behavior between a base workflow run and one or more comparison runs.
+// The base run is the reference point; each comparison run is diffed against it independently.
+func RunAuditDiff(ctx context.Context, baseRunID int64, compareRunIDs []int64, owner, repo, hostname, outputDir string, verbose, jsonOutput bool, format string) error {
+	auditDiffLog.Printf("Starting audit diff: base=%d, compare=%v", baseRunID, compareRunIDs)
 
 	// Auto-detect GHES host from git remote if hostname is not provided
 	if hostname == "" {
@@ -94,57 +114,65 @@ func RunAuditDiff(ctx context.Context, runID1, runID2 int64, owner, repo, hostna
 	default:
 	}
 
-	fmt.Fprintln(os.Stderr, console.FormatInfoMessage(fmt.Sprintf("Comparing workflow runs: Run #%d → Run #%d", runID1, runID2)))
+	if len(compareRunIDs) == 1 {
+		fmt.Fprintln(os.Stderr, console.FormatInfoMessage(fmt.Sprintf("Comparing workflow runs: Run #%d → Run #%d", baseRunID, compareRunIDs[0])))
+	} else {
+		fmt.Fprintln(os.Stderr, console.FormatInfoMessage(fmt.Sprintf("Comparing workflow runs: Run #%d (base) vs %d comparison runs", baseRunID, len(compareRunIDs))))
+	}
 
-	// Load run summaries for both runs
-	fmt.Fprintln(os.Stderr, console.FormatProgressMessage(fmt.Sprintf("Loading data for run %d...", runID1)))
-	summary1, err := loadRunSummaryForDiff(runID1, outputDir, owner, repo, hostname, verbose)
+	// Load base run summary once (shared across all comparisons)
+	fmt.Fprintln(os.Stderr, console.FormatProgressMessage(fmt.Sprintf("Loading data for base run %d...", baseRunID)))
+	baseSummary, err := loadRunSummaryForDiff(baseRunID, outputDir, owner, repo, hostname, verbose)
 	if err != nil {
-		return fmt.Errorf("failed to load data for run %d: %w", runID1, err)
+		return fmt.Errorf("failed to load data for base run %d: %w", baseRunID, err)
 	}
 
-	// Check context cancellation between downloads
-	select {
-	case <-ctx.Done():
-		fmt.Fprintln(os.Stderr, console.FormatWarningMessage("Operation cancelled"))
-		return ctx.Err()
-	default:
-	}
+	diffs := make([]*AuditDiff, 0, len(compareRunIDs))
 
-	fmt.Fprintln(os.Stderr, console.FormatProgressMessage(fmt.Sprintf("Loading data for run %d...", runID2)))
-	summary2, err := loadRunSummaryForDiff(runID2, outputDir, owner, repo, hostname, verbose)
-	if err != nil {
-		return fmt.Errorf("failed to load data for run %d: %w", runID2, err)
-	}
+	for _, compareRunID := range compareRunIDs {
+		// Check context cancellation between downloads
+		select {
+		case <-ctx.Done():
+			fmt.Fprintln(os.Stderr, console.FormatWarningMessage("Operation cancelled"))
+			return ctx.Err()
+		default:
+		}
 
-	// Warn if no firewall data found
-	fw1 := summary1.FirewallAnalysis
-	fw2 := summary2.FirewallAnalysis
-	if fw1 == nil && fw2 == nil {
-		fmt.Fprintln(os.Stderr, console.FormatWarningMessage("No firewall data found in either run. Both runs may predate firewall logging."))
-	} else {
-		if fw1 == nil {
-			fmt.Fprintln(os.Stderr, console.FormatWarningMessage(fmt.Sprintf("No firewall data found for run %d (older run may lack firewall logs)", runID1)))
+		fmt.Fprintln(os.Stderr, console.FormatProgressMessage(fmt.Sprintf("Loading data for run %d...", compareRunID)))
+		compareSummary, err := loadRunSummaryForDiff(compareRunID, outputDir, owner, repo, hostname, verbose)
+		if err != nil {
+			return fmt.Errorf("failed to load data for run %d: %w", compareRunID, err)
 		}
-		if fw2 == nil {
-			fmt.Fprintln(os.Stderr, console.FormatWarningMessage(fmt.Sprintf("No firewall data found for run %d", runID2)))
+
+		// Warn if no firewall data found for this pair
+		fw1 := baseSummary.FirewallAnalysis
+		fw2 := compareSummary.FirewallAnalysis
+		if fw1 == nil && fw2 == nil {
+			fmt.Fprintln(os.Stderr, console.FormatWarningMessage(fmt.Sprintf("No firewall data found for run pair %d→%d. Both runs may predate firewall logging.", baseRunID, compareRunID)))
+		} else {
+			if fw1 == nil {
+				fmt.Fprintln(os.Stderr, console.FormatWarningMessage(fmt.Sprintf("No firewall data found for base run %d (older run may lack firewall logs)", baseRunID)))
+			}
+			if fw2 == nil {
+				fmt.Fprintln(os.Stderr, console.FormatWarningMessage(fmt.Sprintf("No firewall data found for run %d", compareRunID)))
+			}
 		}
-	}
 
-	// Compute the full diff
-	diff := computeAuditDiff(runID1, runID2, summary1, summary2)
+		diff := computeAuditDiff(baseRunID, compareRunID, baseSummary, compareSummary)
+		diffs = append(diffs, diff)
+	}
 
 	// Render output
 	if jsonOutput || format == "json" {
-		return renderAuditDiffJSON(diff)
+		return renderAuditDiffJSON(diffs)
 	}
 
 	if format == "markdown" {
-		renderAuditDiffMarkdown(diff)
+		renderAuditDiffMarkdown(diffs)
 		return nil
 	}
 
 	// Default: pretty console output
-	renderAuditDiffPretty(diff)
+	renderAuditDiffPretty(diffs)
 	return nil
 }