Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
142 changes: 132 additions & 10 deletions pkg/cli/audit_diff.go
Original file line number Diff line number Diff line change
Expand Up @@ -260,17 +260,44 @@ type MCPToolsDiffSummary struct {
AnomalyCount int `json:"anomaly_count"`
}

// TokenUsageDiff is the per-category token usage comparison between two runs.
// Its values come from the firewall proxy token-usage.jsonl data exposed as
// RunSummary.TokenUsage. Each metric carries the raw value for both runs plus a
// preformatted change string that is dropped from the JSON output when empty.
type TokenUsageDiff struct {
	// Input tokens for each run and the formatted change between them.
	Run1InputTokens   int    `json:"run1_input_tokens"`
	Run2InputTokens   int    `json:"run2_input_tokens"`
	InputTokensChange string `json:"input_tokens_change,omitempty"`

	// Output tokens for each run and the formatted change between them.
	Run1OutputTokens   int    `json:"run1_output_tokens"`
	Run2OutputTokens   int    `json:"run2_output_tokens"`
	OutputTokensChange string `json:"output_tokens_change,omitempty"`

	// Cache-read tokens for each run and the formatted change between them.
	Run1CacheReadTokens   int    `json:"run1_cache_read_tokens"`
	Run2CacheReadTokens   int    `json:"run2_cache_read_tokens"`
	CacheReadTokensChange string `json:"cache_read_tokens_change,omitempty"`

	// Cache-write tokens for each run and the formatted change between them.
	Run1CacheWriteTokens   int    `json:"run1_cache_write_tokens"`
	Run2CacheWriteTokens   int    `json:"run2_cache_write_tokens"`
	CacheWriteTokensChange string `json:"cache_write_tokens_change,omitempty"`

	// Effective tokens for each run and the formatted change between them.
	Run1EffectiveTokens   int    `json:"run1_effective_tokens"`
	Run2EffectiveTokens   int    `json:"run2_effective_tokens"`
	EffectiveTokensChange string `json:"effective_tokens_change,omitempty"`

	// Request counts for each run. RequestsDelta is the absolute request-count
	// delta, e.g. "+4".
	Run1TotalRequests int    `json:"run1_total_requests"`
	Run2TotalRequests int    `json:"run2_total_requests"`
	RequestsDelta     string `json:"requests_delta,omitempty"`

	// Cache efficiency for each run. CacheEfficiencyChange is the
	// percentage-point delta, e.g. "+1.5pp".
	Run1CacheEfficiency   float64 `json:"run1_cache_efficiency"`
	Run2CacheEfficiency   float64 `json:"run2_cache_efficiency"`
	CacheEfficiencyChange string  `json:"cache_efficiency_change,omitempty"`
}
Comment on lines +284 to +287
Copy link

Copilot AI Apr 4, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TokenUsageDiff includes per-run cache efficiency values, but there is no corresponding change field, so renderers currently leave the "Change" column blank for that row. If cache efficiency is intended to be part of the diff (per PR description), consider adding a CacheEfficiencyChange (percentage-point delta or relative %) or rendering it in a way that doesn't imply a missing value.

Copilot uses AI. Check for mistakes.

// RunMetricsDiff represents the diff of run-level metrics (token usage, duration, turns) between two runs
type RunMetricsDiff struct {
Run1TokenUsage int `json:"run1_token_usage"`
Run2TokenUsage int `json:"run2_token_usage"`
TokenUsageChange string `json:"token_usage_change,omitempty"` // e.g. "+15%", "-5%"
Run1Duration string `json:"run1_duration,omitempty"`
Run2Duration string `json:"run2_duration,omitempty"`
DurationChange string `json:"duration_change,omitempty"` // e.g. "+2m30s", "-1m"
Run1Turns int `json:"run1_turns,omitempty"`
Run2Turns int `json:"run2_turns,omitempty"`
TurnsChange int `json:"turns_change,omitempty"`
Run1TokenUsage int `json:"run1_token_usage"`
Run2TokenUsage int `json:"run2_token_usage"`
TokenUsageChange string `json:"token_usage_change,omitempty"` // e.g. "+15%", "-5%"
Run1Duration string `json:"run1_duration,omitempty"`
Run2Duration string `json:"run2_duration,omitempty"`
DurationChange string `json:"duration_change,omitempty"` // e.g. "+2m30s", "-1m"
Run1Turns int `json:"run1_turns,omitempty"`
Run2Turns int `json:"run2_turns,omitempty"`
TurnsChange int `json:"turns_change,omitempty"`
TokenUsageDetails *TokenUsageDiff `json:"token_usage_details,omitempty"` // Detailed breakdown from firewall proxy
}

// AuditDiff is the top-level diff combining firewall behavior, MCP tool invocations,
Expand Down Expand Up @@ -429,20 +456,24 @@ func computeRunMetricsDiff(summary1, summary2 *RunSummary) *RunMetricsDiff {
var run1Tokens, run2Tokens int
var run1Duration, run2Duration time.Duration
var run1Turns, run2Turns int
var tu1, tu2 *TokenUsageSummary

if summary1 != nil {
run1Tokens = summary1.Run.TokenUsage
run1Duration = summary1.Run.Duration
run1Turns = summary1.Run.Turns
tu1 = summary1.TokenUsage
}
if summary2 != nil {
run2Tokens = summary2.Run.TokenUsage
run2Duration = summary2.Run.Duration
run2Turns = summary2.Run.Turns
tu2 = summary2.TokenUsage
}

// Skip if there is no meaningful data
if run1Tokens == 0 && run2Tokens == 0 && run1Duration == 0 && run2Duration == 0 && run1Turns == 0 && run2Turns == 0 {
hasTokenDetails := tu1 != nil || tu2 != nil
if run1Tokens == 0 && run2Tokens == 0 && run1Duration == 0 && run2Duration == 0 && run1Turns == 0 && run2Turns == 0 && !hasTokenDetails {
return nil
}

Expand Down Expand Up @@ -473,9 +504,100 @@ func computeRunMetricsDiff(summary1, summary2 *RunSummary) *RunMetricsDiff {
}
}

diff.TokenUsageDetails = computeTokenUsageDiff(tu1, tu2)

return diff
}

// computeTokenUsageDiff builds the detailed token usage comparison for two runs from
// their firewall proxy token-usage.jsonl summaries (TokenUsageSummary).
//
// It returns nil only when both summaries are nil. When exactly one is nil, that
// run's values are reported as zero. A change string is filled in only when at
// least one of the two runs has a positive value for that metric; otherwise it
// is left empty.
func computeTokenUsageDiff(tu1, tu2 *TokenUsageSummary) *TokenUsageDiff {
	if tu1 == nil && tu2 == nil {
		return nil
	}

	// Stand in a zero-valued summary for a missing side so every field can be
	// read unconditionally below.
	var none TokenUsageSummary
	first, second := tu1, tu2
	if first == nil {
		first = &none
	}
	if second == nil {
		second = &none
	}

	result := &TokenUsageDiff{
		Run1InputTokens:      first.TotalInputTokens,
		Run2InputTokens:      second.TotalInputTokens,
		Run1OutputTokens:     first.TotalOutputTokens,
		Run2OutputTokens:     second.TotalOutputTokens,
		Run1CacheReadTokens:  first.TotalCacheReadTokens,
		Run2CacheReadTokens:  second.TotalCacheReadTokens,
		Run1CacheWriteTokens: first.TotalCacheWriteTokens,
		Run2CacheWriteTokens: second.TotalCacheWriteTokens,
		Run1EffectiveTokens:  first.TotalEffectiveTokens,
		Run2EffectiveTokens:  second.TotalEffectiveTokens,
		Run1TotalRequests:    first.TotalRequests,
		Run2TotalRequests:    second.TotalRequests,
		Run1CacheEfficiency:  first.CacheEfficiency,
		Run2CacheEfficiency:  second.CacheEfficiency,
	}

	// volumeChange formats a token-count change, or yields "" when neither run
	// recorded a positive value for the metric.
	volumeChange := func(before, after int) string {
		if before <= 0 && after <= 0 {
			return ""
		}
		return formatVolumeChange(before, after)
	}

	result.InputTokensChange = volumeChange(result.Run1InputTokens, result.Run2InputTokens)
	result.OutputTokensChange = volumeChange(result.Run1OutputTokens, result.Run2OutputTokens)
	result.CacheReadTokensChange = volumeChange(result.Run1CacheReadTokens, result.Run2CacheReadTokens)
	result.CacheWriteTokensChange = volumeChange(result.Run1CacheWriteTokens, result.Run2CacheWriteTokens)
	result.EffectiveTokensChange = volumeChange(result.Run1EffectiveTokens, result.Run2EffectiveTokens)

	// Requests use an absolute count delta rather than a volume change.
	if result.Run1TotalRequests > 0 || result.Run2TotalRequests > 0 {
		result.RequestsDelta = formatCountChange(result.Run1TotalRequests, result.Run2TotalRequests)
	}

	// Cache efficiency is compared as a percentage-point delta.
	if result.Run1CacheEfficiency > 0 || result.Run2CacheEfficiency > 0 {
		result.CacheEfficiencyChange = formatPercentagePointChange(result.Run1CacheEfficiency, result.Run2CacheEfficiency)
	}

	return result
}

// formatPercentagePointChange formats the change between two ratio values (0.0-1.0) as a
// percentage-point delta with one decimal place (e.g. "+1.5pp", "-2.3pp").
//
// ratio1 is the value for the first run and ratio2 the value for the second; the
// result is (ratio2 - ratio1) * 100. Non-negative deltas carry an explicit "+".
// A delta that rounds to zero is always rendered as "+0.0pp", so a negligible
// decrease is not shown as "-0.0pp".
func formatPercentagePointChange(ratio1, ratio2 float64) string {
	delta := (ratio2 - ratio1) * 100
	text := fmt.Sprintf("%.1f", delta)

	// Decide the sign from the rounded text: a tiny negative delta rounds to
	// "-0.0", which would otherwise disagree with the "+0.0pp" shown for no change.
	if text == "-0.0" {
		return "+0.0pp"
	}
	if delta >= 0 {
		return "+" + text + "pp"
	}
	return text + "pp"
}

// formatCountChange formats the absolute change in a count value (e.g. "+3", "-1")
func formatCountChange(count1, count2 int) string {
delta := count2 - count1
Expand Down
136 changes: 82 additions & 54 deletions pkg/cli/audit_diff_command.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,35 +15,54 @@ import (
// NewAuditDiffSubcommand creates the audit diff subcommand
func NewAuditDiffSubcommand() *cobra.Command {
cmd := &cobra.Command{
Use: "diff <run-id-1> <run-id-2>",
Short: "Compare behavior across two workflow runs",
Long: `Compare workflow run behavior between two workflow runs to detect policy regressions,
new unauthorized domains, behavioral drift, and changes in MCP tool usage or run metrics.
Use: "diff <base-run-id> <compare-run-id>...",
Short: "Compare behavior across workflow runs",
Long: `Compare workflow run behavior between a base run and one or more comparison runs
to detect policy regressions, new unauthorized domains, behavioral drift, and changes in
MCP tool usage, token usage, or run metrics.

This command downloads artifacts for both runs (using cached data when available),
The first argument is the base (reference) run. All subsequent arguments are compared
against that base. This enables tracking behavioral drift across multiple runs at once.

This command downloads artifacts for all runs (using cached data when available),
analyzes their data, and produces a diff showing:
- New domains that appeared in the second run
- Removed domains that were in the first run but not the second
- New domains that appeared in the comparison run
- Removed domains that were in the base run but not the comparison
- Status changes (domains that flipped between allowed and denied)
- Volume changes (significant request count changes, >100% threshold)
- Anomaly flags (new denied domains, previously-denied now allowed)
- MCP tool invocation changes (new/removed tools, call count and error count diffs)
- Run metrics comparison (token usage, duration, turns) when cached data is available
- Detailed token usage breakdown (input/output/cache/effective tokens) from firewall proxy

Examples:
` + string(constants.CLIExtensionPrefix) + ` audit diff 12345 12346 # Compare two runs
` + string(constants.CLIExtensionPrefix) + ` audit diff 12345 12346 --format markdown # Markdown output for PR comments
` + string(constants.CLIExtensionPrefix) + ` audit diff 12345 12346 --json # JSON for CI integration
` + string(constants.CLIExtensionPrefix) + ` audit diff 12345 12346 --repo owner/repo # Specify repository`,
Args: cobra.ExactArgs(2),
` + string(constants.CLIExtensionPrefix) + ` audit diff 12345 12346 # Compare two runs
` + string(constants.CLIExtensionPrefix) + ` audit diff 12345 12346 12347 12348 # Compare base against 3 runs
` + string(constants.CLIExtensionPrefix) + ` audit diff 12345 12346 --format markdown # Markdown output for PR comments
` + string(constants.CLIExtensionPrefix) + ` audit diff 12345 12346 --json # JSON for CI integration
` + string(constants.CLIExtensionPrefix) + ` audit diff 12345 12346 --repo owner/repo # Specify repository`,
Args: cobra.MinimumNArgs(2),
RunE: func(cmd *cobra.Command, args []string) error {
runID1, err := strconv.ParseInt(args[0], 10, 64)
baseRunID, err := strconv.ParseInt(args[0], 10, 64)
if err != nil {
return fmt.Errorf("invalid run ID %q: must be a numeric run ID", args[0])
return fmt.Errorf("invalid base run ID %q: must be a numeric run ID", args[0])
}
runID2, err := strconv.ParseInt(args[1], 10, 64)
if err != nil {
return fmt.Errorf("invalid run ID %q: must be a numeric run ID", args[1])

compareRunIDs := make([]int64, 0, len(args)-1)
seen := make(map[int64]bool)
for _, arg := range args[1:] {
id, err := strconv.ParseInt(arg, 10, 64)
if err != nil {
return fmt.Errorf("invalid run ID %q: must be a numeric run ID", arg)
}
if id == baseRunID {
return fmt.Errorf("comparison run ID %d is the same as the base run ID: cannot diff a run against itself", id)
}
if seen[id] {
return fmt.Errorf("duplicate comparison run ID %d: each run ID must appear only once", id)
}
seen[id] = true
compareRunIDs = append(compareRunIDs, id)
}

outputDir, _ := cmd.Flags().GetString("output")
Expand All @@ -62,7 +81,7 @@ Examples:
repo = parts[1]
}

return RunAuditDiff(cmd.Context(), runID1, runID2, owner, repo, hostname, outputDir, verbose, jsonOutput, format)
return RunAuditDiff(cmd.Context(), baseRunID, compareRunIDs, owner, repo, hostname, outputDir, verbose, jsonOutput, format)
},
}

Expand All @@ -74,9 +93,10 @@ Examples:
return cmd
}

// RunAuditDiff compares behavior between two workflow runs
func RunAuditDiff(ctx context.Context, runID1, runID2 int64, owner, repo, hostname, outputDir string, verbose, jsonOutput bool, format string) error {
auditDiffLog.Printf("Starting audit diff: run1=%d, run2=%d", runID1, runID2)
// RunAuditDiff compares behavior between a base workflow run and one or more comparison runs.
// The base run is the reference point; each comparison run is diffed against it independently.
func RunAuditDiff(ctx context.Context, baseRunID int64, compareRunIDs []int64, owner, repo, hostname, outputDir string, verbose, jsonOutput bool, format string) error {
auditDiffLog.Printf("Starting audit diff: base=%d, compare=%v", baseRunID, compareRunIDs)

// Auto-detect GHES host from git remote if hostname is not provided
if hostname == "" {
Expand All @@ -94,57 +114,65 @@ func RunAuditDiff(ctx context.Context, runID1, runID2 int64, owner, repo, hostna
default:
}

fmt.Fprintln(os.Stderr, console.FormatInfoMessage(fmt.Sprintf("Comparing workflow runs: Run #%d → Run #%d", runID1, runID2)))
if len(compareRunIDs) == 1 {
fmt.Fprintln(os.Stderr, console.FormatInfoMessage(fmt.Sprintf("Comparing workflow runs: Run #%d → Run #%d", baseRunID, compareRunIDs[0])))
} else {
fmt.Fprintln(os.Stderr, console.FormatInfoMessage(fmt.Sprintf("Comparing workflow runs: Run #%d (base) vs %d comparison runs", baseRunID, len(compareRunIDs))))
}

// Load run summaries for both runs
fmt.Fprintln(os.Stderr, console.FormatProgressMessage(fmt.Sprintf("Loading data for run %d...", runID1)))
summary1, err := loadRunSummaryForDiff(runID1, outputDir, owner, repo, hostname, verbose)
// Load base run summary once (shared across all comparisons)
fmt.Fprintln(os.Stderr, console.FormatProgressMessage(fmt.Sprintf("Loading data for base run %d...", baseRunID)))
baseSummary, err := loadRunSummaryForDiff(baseRunID, outputDir, owner, repo, hostname, verbose)
if err != nil {
return fmt.Errorf("failed to load data for run %d: %w", runID1, err)
return fmt.Errorf("failed to load data for base run %d: %w", baseRunID, err)
}

// Check context cancellation between downloads
select {
case <-ctx.Done():
fmt.Fprintln(os.Stderr, console.FormatWarningMessage("Operation cancelled"))
return ctx.Err()
default:
}
diffs := make([]*AuditDiff, 0, len(compareRunIDs))

fmt.Fprintln(os.Stderr, console.FormatProgressMessage(fmt.Sprintf("Loading data for run %d...", runID2)))
summary2, err := loadRunSummaryForDiff(runID2, outputDir, owner, repo, hostname, verbose)
if err != nil {
return fmt.Errorf("failed to load data for run %d: %w", runID2, err)
}
for _, compareRunID := range compareRunIDs {
// Check context cancellation between downloads
select {
case <-ctx.Done():
fmt.Fprintln(os.Stderr, console.FormatWarningMessage("Operation cancelled"))
return ctx.Err()
default:
}

// Warn if no firewall data found
fw1 := summary1.FirewallAnalysis
fw2 := summary2.FirewallAnalysis
if fw1 == nil && fw2 == nil {
fmt.Fprintln(os.Stderr, console.FormatWarningMessage("No firewall data found in either run. Both runs may predate firewall logging."))
} else {
if fw1 == nil {
fmt.Fprintln(os.Stderr, console.FormatWarningMessage(fmt.Sprintf("No firewall data found for run %d (older run may lack firewall logs)", runID1)))
fmt.Fprintln(os.Stderr, console.FormatProgressMessage(fmt.Sprintf("Loading data for run %d...", compareRunID)))
compareSummary, err := loadRunSummaryForDiff(compareRunID, outputDir, owner, repo, hostname, verbose)
if err != nil {
return fmt.Errorf("failed to load data for run %d: %w", compareRunID, err)
}
if fw2 == nil {
fmt.Fprintln(os.Stderr, console.FormatWarningMessage(fmt.Sprintf("No firewall data found for run %d", runID2)))

// Warn if no firewall data found for this pair
fw1 := baseSummary.FirewallAnalysis
fw2 := compareSummary.FirewallAnalysis
if fw1 == nil && fw2 == nil {
fmt.Fprintln(os.Stderr, console.FormatWarningMessage(fmt.Sprintf("No firewall data found for run pair %d→%d. Both runs may predate firewall logging.", baseRunID, compareRunID)))
} else {
if fw1 == nil {
fmt.Fprintln(os.Stderr, console.FormatWarningMessage(fmt.Sprintf("No firewall data found for base run %d (older run may lack firewall logs)", baseRunID)))
}
if fw2 == nil {
fmt.Fprintln(os.Stderr, console.FormatWarningMessage(fmt.Sprintf("No firewall data found for run %d", compareRunID)))
}
}
}

// Compute the full diff
diff := computeAuditDiff(runID1, runID2, summary1, summary2)
diff := computeAuditDiff(baseRunID, compareRunID, baseSummary, compareSummary)
diffs = append(diffs, diff)
}

// Render output
if jsonOutput || format == "json" {
return renderAuditDiffJSON(diff)
return renderAuditDiffJSON(diffs)
}

if format == "markdown" {
renderAuditDiffMarkdown(diff)
renderAuditDiffMarkdown(diffs)
return nil
}

// Default: pretty console output
renderAuditDiffPretty(diff)
renderAuditDiffPretty(diffs)
return nil
}
Loading