Skip to content

Commit cc344d1

Browse files
authored
feat: support token usage diff and multiple comparison runs in audit diff command (#24544)
1 parent 97fd0cc commit cc344d1

4 files changed

Lines changed: 732 additions & 118 deletions

File tree

pkg/cli/audit_diff.go

Lines changed: 132 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -260,17 +260,44 @@ type MCPToolsDiffSummary struct {
260260
AnomalyCount int `json:"anomaly_count"`
261261
}
262262

263+
// TokenUsageDiff is the detailed token usage comparison between two runs, built from
// the firewall proxy token-usage.jsonl data exposed as RunSummary.TokenUsage.
//
// Every metric carries the raw value for run 1 and run 2 plus a preformatted change
// string. Change strings are tagged omitempty, so an empty one is left out of the
// JSON output entirely; the raw values are always emitted, including zeros.
type TokenUsageDiff struct {
	// Input tokens.
	Run1InputTokens   int    `json:"run1_input_tokens"`
	Run2InputTokens   int    `json:"run2_input_tokens"`
	InputTokensChange string `json:"input_tokens_change,omitempty"`

	// Output tokens.
	Run1OutputTokens   int    `json:"run1_output_tokens"`
	Run2OutputTokens   int    `json:"run2_output_tokens"`
	OutputTokensChange string `json:"output_tokens_change,omitempty"`

	// Cache-read tokens.
	Run1CacheReadTokens   int    `json:"run1_cache_read_tokens"`
	Run2CacheReadTokens   int    `json:"run2_cache_read_tokens"`
	CacheReadTokensChange string `json:"cache_read_tokens_change,omitempty"`

	// Cache-write tokens.
	Run1CacheWriteTokens   int    `json:"run1_cache_write_tokens"`
	Run2CacheWriteTokens   int    `json:"run2_cache_write_tokens"`
	CacheWriteTokensChange string `json:"cache_write_tokens_change,omitempty"`

	// Effective tokens.
	Run1EffectiveTokens   int    `json:"run1_effective_tokens"`
	Run2EffectiveTokens   int    `json:"run2_effective_tokens"`
	EffectiveTokensChange string `json:"effective_tokens_change,omitempty"`

	// Request counts.
	Run1TotalRequests int    `json:"run1_total_requests"`
	Run2TotalRequests int    `json:"run2_total_requests"`
	RequestsDelta     string `json:"requests_delta,omitempty"` // Absolute request-count delta, e.g. "+4"

	// Cache efficiency.
	Run1CacheEfficiency   float64 `json:"run1_cache_efficiency"`
	Run2CacheEfficiency   float64 `json:"run2_cache_efficiency"`
	CacheEfficiencyChange string  `json:"cache_efficiency_change,omitempty"` // Percentage-point delta, e.g. "+1.5pp"
}
288+
263289
// RunMetricsDiff represents the diff of run-level metrics (token usage, duration, turns) between two runs
264290
type RunMetricsDiff struct {
265-
Run1TokenUsage int `json:"run1_token_usage"`
266-
Run2TokenUsage int `json:"run2_token_usage"`
267-
TokenUsageChange string `json:"token_usage_change,omitempty"` // e.g. "+15%", "-5%"
268-
Run1Duration string `json:"run1_duration,omitempty"`
269-
Run2Duration string `json:"run2_duration,omitempty"`
270-
DurationChange string `json:"duration_change,omitempty"` // e.g. "+2m30s", "-1m"
271-
Run1Turns int `json:"run1_turns,omitempty"`
272-
Run2Turns int `json:"run2_turns,omitempty"`
273-
TurnsChange int `json:"turns_change,omitempty"`
291+
Run1TokenUsage int `json:"run1_token_usage"`
292+
Run2TokenUsage int `json:"run2_token_usage"`
293+
TokenUsageChange string `json:"token_usage_change,omitempty"` // e.g. "+15%", "-5%"
294+
Run1Duration string `json:"run1_duration,omitempty"`
295+
Run2Duration string `json:"run2_duration,omitempty"`
296+
DurationChange string `json:"duration_change,omitempty"` // e.g. "+2m30s", "-1m"
297+
Run1Turns int `json:"run1_turns,omitempty"`
298+
Run2Turns int `json:"run2_turns,omitempty"`
299+
TurnsChange int `json:"turns_change,omitempty"`
300+
TokenUsageDetails *TokenUsageDiff `json:"token_usage_details,omitempty"` // Detailed breakdown from firewall proxy
274301
}
275302

276303
// AuditDiff is the top-level diff combining firewall behavior, MCP tool invocations,
@@ -429,20 +456,24 @@ func computeRunMetricsDiff(summary1, summary2 *RunSummary) *RunMetricsDiff {
429456
var run1Tokens, run2Tokens int
430457
var run1Duration, run2Duration time.Duration
431458
var run1Turns, run2Turns int
459+
var tu1, tu2 *TokenUsageSummary
432460

433461
if summary1 != nil {
434462
run1Tokens = summary1.Run.TokenUsage
435463
run1Duration = summary1.Run.Duration
436464
run1Turns = summary1.Run.Turns
465+
tu1 = summary1.TokenUsage
437466
}
438467
if summary2 != nil {
439468
run2Tokens = summary2.Run.TokenUsage
440469
run2Duration = summary2.Run.Duration
441470
run2Turns = summary2.Run.Turns
471+
tu2 = summary2.TokenUsage
442472
}
443473

444474
// Skip if there is no meaningful data
445-
if run1Tokens == 0 && run2Tokens == 0 && run1Duration == 0 && run2Duration == 0 && run1Turns == 0 && run2Turns == 0 {
475+
hasTokenDetails := tu1 != nil || tu2 != nil
476+
if run1Tokens == 0 && run2Tokens == 0 && run1Duration == 0 && run2Duration == 0 && run1Turns == 0 && run2Turns == 0 && !hasTokenDetails {
446477
return nil
447478
}
448479

@@ -473,9 +504,100 @@ func computeRunMetricsDiff(summary1, summary2 *RunSummary) *RunMetricsDiff {
473504
}
474505
}
475506

507+
diff.TokenUsageDetails = computeTokenUsageDiff(tu1, tu2)
508+
476509
return diff
477510
}
478511

512+
// computeTokenUsageDiff computes a detailed diff of token usage between two runs using
513+
// the firewall proxy token-usage.jsonl data (TokenUsageSummary). Returns nil when both
514+
// summaries are nil.
515+
func computeTokenUsageDiff(tu1, tu2 *TokenUsageSummary) *TokenUsageDiff {
516+
if tu1 == nil && tu2 == nil {
517+
return nil
518+
}
519+
520+
var (
521+
run1Input, run2Input int
522+
run1Output, run2Output int
523+
run1CacheRead, run2CacheRead int
524+
run1CacheWrite, run2CacheWrite int
525+
run1Effective, run2Effective int
526+
run1Requests, run2Requests int
527+
run1CacheEff, run2CacheEff float64
528+
)
529+
530+
if tu1 != nil {
531+
run1Input = tu1.TotalInputTokens
532+
run1Output = tu1.TotalOutputTokens
533+
run1CacheRead = tu1.TotalCacheReadTokens
534+
run1CacheWrite = tu1.TotalCacheWriteTokens
535+
run1Effective = tu1.TotalEffectiveTokens
536+
run1Requests = tu1.TotalRequests
537+
run1CacheEff = tu1.CacheEfficiency
538+
}
539+
if tu2 != nil {
540+
run2Input = tu2.TotalInputTokens
541+
run2Output = tu2.TotalOutputTokens
542+
run2CacheRead = tu2.TotalCacheReadTokens
543+
run2CacheWrite = tu2.TotalCacheWriteTokens
544+
run2Effective = tu2.TotalEffectiveTokens
545+
run2Requests = tu2.TotalRequests
546+
run2CacheEff = tu2.CacheEfficiency
547+
}
548+
549+
diff := &TokenUsageDiff{
550+
Run1InputTokens: run1Input,
551+
Run2InputTokens: run2Input,
552+
Run1OutputTokens: run1Output,
553+
Run2OutputTokens: run2Output,
554+
Run1CacheReadTokens: run1CacheRead,
555+
Run2CacheReadTokens: run2CacheRead,
556+
Run1CacheWriteTokens: run1CacheWrite,
557+
Run2CacheWriteTokens: run2CacheWrite,
558+
Run1EffectiveTokens: run1Effective,
559+
Run2EffectiveTokens: run2Effective,
560+
Run1TotalRequests: run1Requests,
561+
Run2TotalRequests: run2Requests,
562+
Run1CacheEfficiency: run1CacheEff,
563+
Run2CacheEfficiency: run2CacheEff,
564+
}
565+
566+
if run1Input > 0 || run2Input > 0 {
567+
diff.InputTokensChange = formatVolumeChange(run1Input, run2Input)
568+
}
569+
if run1Output > 0 || run2Output > 0 {
570+
diff.OutputTokensChange = formatVolumeChange(run1Output, run2Output)
571+
}
572+
if run1CacheRead > 0 || run2CacheRead > 0 {
573+
diff.CacheReadTokensChange = formatVolumeChange(run1CacheRead, run2CacheRead)
574+
}
575+
if run1CacheWrite > 0 || run2CacheWrite > 0 {
576+
diff.CacheWriteTokensChange = formatVolumeChange(run1CacheWrite, run2CacheWrite)
577+
}
578+
if run1Effective > 0 || run2Effective > 0 {
579+
diff.EffectiveTokensChange = formatVolumeChange(run1Effective, run2Effective)
580+
}
581+
if run1Requests > 0 || run2Requests > 0 {
582+
diff.RequestsDelta = formatCountChange(run1Requests, run2Requests)
583+
}
584+
if run1CacheEff > 0 || run2CacheEff > 0 {
585+
diff.CacheEfficiencyChange = formatPercentagePointChange(run1CacheEff, run2CacheEff)
586+
}
587+
588+
return diff
589+
}
590+
591+
// formatPercentagePointChange formats the change between two ratio values (0.0-1.0) as a
// percentage-point delta with one decimal place (e.g. "+1.5pp", "-2.3pp").
//
// ratio1 is the first (base) run's ratio and ratio2 the second run's; the result is
// (ratio2 - ratio1) * 100. Non-negative deltas get an explicit "+" prefix.
//
// A delta that rounds to zero is always rendered as "+0.0pp". Without this, a tiny
// negative difference (for example floating-point noise between two equal
// efficiencies) would be printed as the misleading "-0.0pp" while an exact zero is
// printed as "+0.0pp".
func formatPercentagePointChange(ratio1, ratio2 float64) string {
	delta := (ratio2 - ratio1) * 100
	formatted := fmt.Sprintf("%.1fpp", delta)

	switch {
	case formatted == "-0.0pp":
		// Negative value that vanished at one-decimal precision: report "no change"
		// the same way an exact zero is reported.
		return "+0.0pp"
	case delta >= 0:
		return "+" + formatted
	default:
		// The formatted value already carries its "-" sign.
		return formatted
	}
}
600+
479601
// formatCountChange formats the absolute change in a count value (e.g. "+3", "-1")
480602
func formatCountChange(count1, count2 int) string {
481603
delta := count2 - count1

pkg/cli/audit_diff_command.go

Lines changed: 82 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -15,35 +15,54 @@ import (
1515
// NewAuditDiffSubcommand creates the audit diff subcommand
1616
func NewAuditDiffSubcommand() *cobra.Command {
1717
cmd := &cobra.Command{
18-
Use: "diff <run-id-1> <run-id-2>",
19-
Short: "Compare behavior across two workflow runs",
20-
Long: `Compare workflow run behavior between two workflow runs to detect policy regressions,
21-
new unauthorized domains, behavioral drift, and changes in MCP tool usage or run metrics.
18+
Use: "diff <base-run-id> <compare-run-id>...",
19+
Short: "Compare behavior across workflow runs",
20+
Long: `Compare workflow run behavior between a base run and one or more comparison runs
21+
to detect policy regressions, new unauthorized domains, behavioral drift, and changes in
22+
MCP tool usage, token usage, or run metrics.
2223
23-
This command downloads artifacts for both runs (using cached data when available),
24+
The first argument is the base (reference) run. All subsequent arguments are compared
25+
against that base. This enables tracking behavioral drift across multiple runs at once.
26+
27+
This command downloads artifacts for all runs (using cached data when available),
2428
analyzes their data, and produces a diff showing:
25-
- New domains that appeared in the second run
26-
- Removed domains that were in the first run but not the second
29+
- New domains that appeared in the comparison run
30+
- Removed domains that were in the base run but not the comparison
2731
- Status changes (domains that flipped between allowed and denied)
2832
- Volume changes (significant request count changes, >100% threshold)
2933
- Anomaly flags (new denied domains, previously-denied now allowed)
3034
- MCP tool invocation changes (new/removed tools, call count and error count diffs)
3135
- Run metrics comparison (token usage, duration, turns) when cached data is available
36+
- Detailed token usage breakdown (input/output/cache/effective tokens) from firewall proxy
3237
3338
Examples:
34-
` + string(constants.CLIExtensionPrefix) + ` audit diff 12345 12346 # Compare two runs
35-
` + string(constants.CLIExtensionPrefix) + ` audit diff 12345 12346 --format markdown # Markdown output for PR comments
36-
` + string(constants.CLIExtensionPrefix) + ` audit diff 12345 12346 --json # JSON for CI integration
37-
` + string(constants.CLIExtensionPrefix) + ` audit diff 12345 12346 --repo owner/repo # Specify repository`,
38-
Args: cobra.ExactArgs(2),
39+
` + string(constants.CLIExtensionPrefix) + ` audit diff 12345 12346 # Compare two runs
40+
` + string(constants.CLIExtensionPrefix) + ` audit diff 12345 12346 12347 12348 # Compare base against 3 runs
41+
` + string(constants.CLIExtensionPrefix) + ` audit diff 12345 12346 --format markdown # Markdown output for PR comments
42+
` + string(constants.CLIExtensionPrefix) + ` audit diff 12345 12346 --json # JSON for CI integration
43+
` + string(constants.CLIExtensionPrefix) + ` audit diff 12345 12346 --repo owner/repo # Specify repository`,
44+
Args: cobra.MinimumNArgs(2),
3945
RunE: func(cmd *cobra.Command, args []string) error {
40-
runID1, err := strconv.ParseInt(args[0], 10, 64)
46+
baseRunID, err := strconv.ParseInt(args[0], 10, 64)
4147
if err != nil {
42-
return fmt.Errorf("invalid run ID %q: must be a numeric run ID", args[0])
48+
return fmt.Errorf("invalid base run ID %q: must be a numeric run ID", args[0])
4349
}
44-
runID2, err := strconv.ParseInt(args[1], 10, 64)
45-
if err != nil {
46-
return fmt.Errorf("invalid run ID %q: must be a numeric run ID", args[1])
50+
51+
compareRunIDs := make([]int64, 0, len(args)-1)
52+
seen := make(map[int64]bool)
53+
for _, arg := range args[1:] {
54+
id, err := strconv.ParseInt(arg, 10, 64)
55+
if err != nil {
56+
return fmt.Errorf("invalid run ID %q: must be a numeric run ID", arg)
57+
}
58+
if id == baseRunID {
59+
return fmt.Errorf("comparison run ID %d is the same as the base run ID: cannot diff a run against itself", id)
60+
}
61+
if seen[id] {
62+
return fmt.Errorf("duplicate comparison run ID %d: each run ID must appear only once", id)
63+
}
64+
seen[id] = true
65+
compareRunIDs = append(compareRunIDs, id)
4766
}
4867

4968
outputDir, _ := cmd.Flags().GetString("output")
@@ -62,7 +81,7 @@ Examples:
6281
repo = parts[1]
6382
}
6483

65-
return RunAuditDiff(cmd.Context(), runID1, runID2, owner, repo, hostname, outputDir, verbose, jsonOutput, format)
84+
return RunAuditDiff(cmd.Context(), baseRunID, compareRunIDs, owner, repo, hostname, outputDir, verbose, jsonOutput, format)
6685
},
6786
}
6887

@@ -74,9 +93,10 @@ Examples:
7493
return cmd
7594
}
7695

77-
// RunAuditDiff compares behavior between two workflow runs
78-
func RunAuditDiff(ctx context.Context, runID1, runID2 int64, owner, repo, hostname, outputDir string, verbose, jsonOutput bool, format string) error {
79-
auditDiffLog.Printf("Starting audit diff: run1=%d, run2=%d", runID1, runID2)
96+
// RunAuditDiff compares behavior between a base workflow run and one or more comparison runs.
97+
// The base run is the reference point; each comparison run is diffed against it independently.
98+
func RunAuditDiff(ctx context.Context, baseRunID int64, compareRunIDs []int64, owner, repo, hostname, outputDir string, verbose, jsonOutput bool, format string) error {
99+
auditDiffLog.Printf("Starting audit diff: base=%d, compare=%v", baseRunID, compareRunIDs)
80100

81101
// Auto-detect GHES host from git remote if hostname is not provided
82102
if hostname == "" {
@@ -94,57 +114,65 @@ func RunAuditDiff(ctx context.Context, runID1, runID2 int64, owner, repo, hostna
94114
default:
95115
}
96116

97-
fmt.Fprintln(os.Stderr, console.FormatInfoMessage(fmt.Sprintf("Comparing workflow runs: Run #%d → Run #%d", runID1, runID2)))
117+
if len(compareRunIDs) == 1 {
118+
fmt.Fprintln(os.Stderr, console.FormatInfoMessage(fmt.Sprintf("Comparing workflow runs: Run #%d → Run #%d", baseRunID, compareRunIDs[0])))
119+
} else {
120+
fmt.Fprintln(os.Stderr, console.FormatInfoMessage(fmt.Sprintf("Comparing workflow runs: Run #%d (base) vs %d comparison runs", baseRunID, len(compareRunIDs))))
121+
}
98122

99-
// Load run summaries for both runs
100-
fmt.Fprintln(os.Stderr, console.FormatProgressMessage(fmt.Sprintf("Loading data for run %d...", runID1)))
101-
summary1, err := loadRunSummaryForDiff(runID1, outputDir, owner, repo, hostname, verbose)
123+
// Load base run summary once (shared across all comparisons)
124+
fmt.Fprintln(os.Stderr, console.FormatProgressMessage(fmt.Sprintf("Loading data for base run %d...", baseRunID)))
125+
baseSummary, err := loadRunSummaryForDiff(baseRunID, outputDir, owner, repo, hostname, verbose)
102126
if err != nil {
103-
return fmt.Errorf("failed to load data for run %d: %w", runID1, err)
127+
return fmt.Errorf("failed to load data for base run %d: %w", baseRunID, err)
104128
}
105129

106-
// Check context cancellation between downloads
107-
select {
108-
case <-ctx.Done():
109-
fmt.Fprintln(os.Stderr, console.FormatWarningMessage("Operation cancelled"))
110-
return ctx.Err()
111-
default:
112-
}
130+
diffs := make([]*AuditDiff, 0, len(compareRunIDs))
113131

114-
fmt.Fprintln(os.Stderr, console.FormatProgressMessage(fmt.Sprintf("Loading data for run %d...", runID2)))
115-
summary2, err := loadRunSummaryForDiff(runID2, outputDir, owner, repo, hostname, verbose)
116-
if err != nil {
117-
return fmt.Errorf("failed to load data for run %d: %w", runID2, err)
118-
}
132+
for _, compareRunID := range compareRunIDs {
133+
// Check context cancellation between downloads
134+
select {
135+
case <-ctx.Done():
136+
fmt.Fprintln(os.Stderr, console.FormatWarningMessage("Operation cancelled"))
137+
return ctx.Err()
138+
default:
139+
}
119140

120-
// Warn if no firewall data found
121-
fw1 := summary1.FirewallAnalysis
122-
fw2 := summary2.FirewallAnalysis
123-
if fw1 == nil && fw2 == nil {
124-
fmt.Fprintln(os.Stderr, console.FormatWarningMessage("No firewall data found in either run. Both runs may predate firewall logging."))
125-
} else {
126-
if fw1 == nil {
127-
fmt.Fprintln(os.Stderr, console.FormatWarningMessage(fmt.Sprintf("No firewall data found for run %d (older run may lack firewall logs)", runID1)))
141+
fmt.Fprintln(os.Stderr, console.FormatProgressMessage(fmt.Sprintf("Loading data for run %d...", compareRunID)))
142+
compareSummary, err := loadRunSummaryForDiff(compareRunID, outputDir, owner, repo, hostname, verbose)
143+
if err != nil {
144+
return fmt.Errorf("failed to load data for run %d: %w", compareRunID, err)
128145
}
129-
if fw2 == nil {
130-
fmt.Fprintln(os.Stderr, console.FormatWarningMessage(fmt.Sprintf("No firewall data found for run %d", runID2)))
146+
147+
// Warn if no firewall data found for this pair
148+
fw1 := baseSummary.FirewallAnalysis
149+
fw2 := compareSummary.FirewallAnalysis
150+
if fw1 == nil && fw2 == nil {
151+
fmt.Fprintln(os.Stderr, console.FormatWarningMessage(fmt.Sprintf("No firewall data found for run pair %d→%d. Both runs may predate firewall logging.", baseRunID, compareRunID)))
152+
} else {
153+
if fw1 == nil {
154+
fmt.Fprintln(os.Stderr, console.FormatWarningMessage(fmt.Sprintf("No firewall data found for base run %d (older run may lack firewall logs)", baseRunID)))
155+
}
156+
if fw2 == nil {
157+
fmt.Fprintln(os.Stderr, console.FormatWarningMessage(fmt.Sprintf("No firewall data found for run %d", compareRunID)))
158+
}
131159
}
132-
}
133160

134-
// Compute the full diff
135-
diff := computeAuditDiff(runID1, runID2, summary1, summary2)
161+
diff := computeAuditDiff(baseRunID, compareRunID, baseSummary, compareSummary)
162+
diffs = append(diffs, diff)
163+
}
136164

137165
// Render output
138166
if jsonOutput || format == "json" {
139-
return renderAuditDiffJSON(diff)
167+
return renderAuditDiffJSON(diffs)
140168
}
141169

142170
if format == "markdown" {
143-
renderAuditDiffMarkdown(diff)
171+
renderAuditDiffMarkdown(diffs)
144172
return nil
145173
}
146174

147175
// Default: pretty console output
148-
renderAuditDiffPretty(diff)
176+
renderAuditDiffPretty(diffs)
149177
return nil
150178
}

0 commit comments

Comments
 (0)