From 5f4df34abce1fede5fedf08911bfe42c5f8b9e24 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 18 Dec 2025 18:22:19 +0000 Subject: [PATCH 1/5] Pivot regression breakdown to matrix table format - Replace linear regression breakdown with 6x6 state transition matrix - States ordered: Pass, Skip, XFail, Fail, Error, Nonexistent - Improvements appear below diagonal, regressions above - Bold formatting for regressions, italic for improvements - Add Error as separate state (previously grouped with Fail) - Track error_tests separately in pytest, nunit, xunit workflows - Move discovery warnings section between matrix and detail lists - Keep collapsible detail sections for test names below matrix --- .github/workflows/regression-test.yml | 380 +++++++++++++++++--------- .github/workflows/test-cs-nunit.yml | 22 +- .github/workflows/test-cs-xunit.yml | 15 +- .github/workflows/test-py-pytest.yml | 11 +- 4 files changed, 289 insertions(+), 139 deletions(-) diff --git a/.github/workflows/regression-test.yml b/.github/workflows/regression-test.yml index 42dcd9c..65a9107 100644 --- a/.github/workflows/regression-test.yml +++ b/.github/workflows/regression-test.yml @@ -289,7 +289,12 @@ jobs: import json import os from pathlib import Path - from typing import List, TextIO + from typing import List, TextIO, Dict, Set, Tuple + + # States ordered from best to worst for matrix display + # This ordering puts improvements below diagonal, regressions above + STATES = ["Pass", "Skip", "XFail", "Fail", "Error", "Nonexistent"] + STATE_KEYS = ["passing", "skipped", "xfailed", "failing", "error", "nonexistent"] def load_json(path: str) -> dict: file_path = Path(path) @@ -330,8 +335,10 @@ jobs: status = status.lower() if status in {"passed", "pass"}: data["passing"].add(str(test_id)) - elif status in {"failed", "fail", "error"}: + elif status in {"failed", "fail"}: data["failing"].add(str(test_id)) + elif status == "error": + data["error"].add(str(test_id)) elif status in {"skipped", "skip"}: data["skipped"].add(str(test_id)) elif status in {"xfailed", "xfail"}: @@ -343,6 +350,7 @@ jobs: data = { "passing": set(coerce_list(raw.get("passing_tests"))), "failing": set(coerce_list(raw.get("failing_tests"))), + "error": set(coerce_list(raw.get("error_tests"))), "skipped": set(coerce_list(raw.get("skipped_tests"))), "xfailed": set(coerce_list(raw.get("xfailed_tests"))), "warnings": set(coerce_list(raw.get("warnings"))), @@ -356,6 +364,7 @@ jobs: data["all"].update( data["passing"] | data["failing"] + | data["error"] | data["skipped"] | data["xfailed"] | data["other"] @@ -363,50 +372,104 @@ jobs: return data + def classify_transition(from_state: str, to_state: str) -> str: + """Classify a state transition as regression, improvement, neutral, or new.""" + if from_state == to_state: + return "diagonal" + if from_state == "Nonexistent": + return "new" + + # Regressions (bold): moving to a worse state + # Anything to Fail or Error is regression + if to_state in ["Fail", "Error"]: + return "regression" + # Anything to Nonexistent is regression + if to_state == "Nonexistent": + return "regression" + # Pass to anything else is regression + if from_state == "Pass": + return "regression" + # Skip to Fail/Error already covered above + + # Neutrals + if from_state == "Fail" and to_state == "XFail": + return "neutral" + if from_state == "Skip" and to_state == "XFail": + return "neutral" + + # Improvements (italic): moving to a better state + if to_state == "Pass": + return "improvement" + if from_state in ["Fail", "Error"] and to_state == 
"Skip": + return "improvement" + if from_state == "Error" and to_state in ["XFail", "Fail"]: + return "improvement" + if from_state == "XFail" and to_state == "Skip": + return "improvement" + + return "neutral" + + def format_cell(count: int, classification: str) -> str: + """Format a cell value based on its classification.""" + if classification == "diagonal": + return "-" + if count == 0: + return "0" + if classification == "regression": + return f"**{count}**" + if classification == "improvement": + return f"*{count}*" + return str(count) + + def get_state_set(data: dict, state: str, all_tests: Set[str]) -> Set[str]: + """Get the set of tests in a given state.""" + state_map = { + "Pass": "passing", + "Skip": "skipped", + "XFail": "xfailed", + "Fail": "failing", + "Error": "error", + "Nonexistent": "nonexistent", + } + if state == "Nonexistent": + return all_tests - data["all"] + return data.get(state_map[state], set()) + baseline_data = build_status_sets(load_json("baseline_results.json")) current_data = build_status_sets(load_json("current_results.json")) - pass_to_fail = sorted(baseline_data["passing"] & current_data["failing"]) - pass_to_skip = sorted( - baseline_data["passing"] & (current_data["skipped"] | current_data["xfailed"]) - ) - pass_to_gone = sorted(baseline_data["passing"] - current_data["all"]) - fail_to_gone = sorted(baseline_data["failing"] - current_data["all"]) - discovery_regressions = sorted(current_data["warnings"] - baseline_data["warnings"]) - - fail_to_skip = sorted(baseline_data["failing"] & current_data["skipped"]) - fail_to_pass = sorted(baseline_data["failing"] & current_data["passing"]) - new_tests = sorted(current_data["all"] - baseline_data["all"]) - - regression_count = ( - len(pass_to_fail) - + len(pass_to_skip) - + len(pass_to_gone) - + len(fail_to_gone) - + len(discovery_regressions) - ) + # Get all tests across both runs + all_tests_union = baseline_data["all"] | current_data["all"] + + # Compute all state transitions + transitions: Dict[Tuple[str, str], List[str]] = {} + for from_state in STATES: + for to_state in STATES: + from_set = get_state_set(baseline_data, from_state, all_tests_union) + to_set = get_state_set(current_data, to_state, all_tests_union) + transitions[(from_state, to_state)] = sorted(from_set & to_set) + + # Discovery warnings (separate from state transitions) + discovery_warnings = sorted(current_data["warnings"] - baseline_data["warnings"]) + + # Calculate regression count (same logic as before, but expanded) + regression_transitions = [] + for (from_state, to_state), tests in transitions.items(): + if classify_transition(from_state, to_state) == "regression": + regression_transitions.extend(tests) + + regression_count = len(set(regression_transitions)) + len(discovery_warnings) has_regressions = regression_count > 0 + # Build analysis payload analysis_payload = { - "pass_to_fail": pass_to_fail, - "pass_to_skip": pass_to_skip, - "pass_to_gone": pass_to_gone, - "fail_to_gone": fail_to_gone, - "discovery_regressions": discovery_regressions, - "fail_to_skip": fail_to_skip, - "fail_to_pass": fail_to_pass, - "new_tests": new_tests, + "transitions": {f"{f}_to_{t}": tests for (f, t), tests in transitions.items()}, + "discovery_warnings": discovery_warnings, "counts": { - "pass_to_fail": len(pass_to_fail), - "pass_to_skip": len(pass_to_skip), - "pass_to_gone": len(pass_to_gone), - "fail_to_gone": len(fail_to_gone), - "discovery": len(discovery_regressions), - "fail_to_skip": len(fail_to_skip), - "fail_to_pass": len(fail_to_pass), - 
"new_tests": len(new_tests), + f"{f}_to_{t}": len(tests) for (f, t), tests in transitions.items() }, } + analysis_payload["counts"]["discovery_warnings"] = len(discovery_warnings) Path("regression_analysis.json").write_text( json.dumps(analysis_payload, indent=2), @@ -422,67 +485,67 @@ jobs: handle.write(f" {idx}. {test_name}\n") handle.write("\n") + # Write comprehensive text report with Path("comprehensive_regression_report.txt").open("w", encoding="utf-8") as report: report.write("COMPREHENSIVE REGRESSION ANALYSIS\n") report.write("=" * 50 + "\n\n") - write_section( - report, - "PASS-TO-FAIL REGRESSIONS", - pass_to_fail, - "Previously passing, now failing:", - ) - write_section( - report, - "PASS-TO-SKIP REGRESSIONS", - pass_to_skip, - "Previously passing, now skipped or xfailed:", - ) - write_section( - report, - "FAIL-TO-SKIP IMPROVEMENTS", - fail_to_skip, - "Previously failing, now skipped (treated as improvements):", - ) - write_section( - report, - "FAIL-TO-PASS IMPROVEMENTS", - fail_to_pass, - "Previously failing, now passing (treated as improvements):", - ) - write_section( - report, - "PASS-TO-GONE REGRESSIONS", - pass_to_gone, - "Previously passing, now completely missing:", - ) - write_section( - report, - "FAIL-TO-GONE REGRESSIONS", - fail_to_gone, - "Previously failing, now completely missing:", - ) - - if discovery_regressions: - report.write( - f"DISCOVERY REGRESSIONS ({len(discovery_regressions)} warnings)\n" - ) + # Write regressions + for from_state in STATES: + for to_state in STATES: + tests = transitions[(from_state, to_state)] + if tests and classify_transition(from_state, to_state) == "regression": + write_section( + report, + f"{from_state.upper()}-TO-{to_state.upper()} REGRESSIONS", + tests, + f"Previously {from_state.lower()}, now {to_state.lower()}:", + ) + + # Write improvements + for from_state in STATES: + for to_state in STATES: + tests = transitions[(from_state, to_state)] + if tests and classify_transition(from_state, to_state) == "improvement": + write_section( + report, + f"{from_state.upper()}-TO-{to_state.upper()} IMPROVEMENTS", + tests, + f"Previously {from_state.lower()}, now {to_state.lower()}:", + ) + + if discovery_warnings: + report.write(f"DISCOVERY WARNINGS ({len(discovery_warnings)} new)\n") report.write("New warnings not present in baseline:\n") - for idx, warning in enumerate(discovery_regressions, 1): + for idx, warning in enumerate(discovery_warnings, 1): truncated = (warning[:200] + "...") if len(warning) > 200 else warning report.write(f" {idx}. 
{truncated}\n") report.write("\n") - write_section( - report, - "NEW TESTS", - new_tests, - "Tests present only in the current run:", - ) + # Write new tests + for to_state in STATES: + if to_state == "Nonexistent": + continue + tests = transitions[("Nonexistent", to_state)] + if tests: + write_section( + report, + f"NEW TESTS ({to_state.upper()})", + tests, + f"New tests in {to_state.lower()} state:", + ) - if not has_regressions and not (fail_to_skip or fail_to_pass or new_tests): - report.write("No regressions or test suite changes detected.\n") + if not has_regressions: + any_changes = any( + len(tests) > 0 + for (f, t), tests in transitions.items() + if f != t + ) + if not any_changes and not discovery_warnings: + report.write("No regressions or test suite changes detected.\n") + # Write pass-to-fail details for backwards compatibility + pass_to_fail = transitions[("Pass", "Fail")] if pass_to_fail: with Path("regression_details.txt").open("w", encoding="utf-8") as handle: handle.write( @@ -496,19 +559,24 @@ jobs: encoding="utf-8", ) - table_rows = [ - ("Pass → Fail", len(pass_to_fail)), - ("Pass → Skip/XFail", len(pass_to_skip)), - ("Pass → Gone", len(pass_to_gone)), - ("Fail → Gone", len(fail_to_gone)), - ("Discovery Warnings", len(discovery_regressions)), - ("Fail → Skip (Improvement)", len(fail_to_skip)), - ("Fail → Pass (Improvement)", len(fail_to_pass)), - ("New Tests", len(new_tests)), - ] + # Build the matrix table for GitHub summary + def build_matrix_table() -> List[str]: + """Build markdown matrix table with state transitions.""" + # Header row + header = "| Baseline ↓ / Current → | " + " | ".join(STATES) + " |" + separator = "| --- | " + " | ".join(["---"] * len(STATES)) + " |" - summary_lines = ["| Category | Count |", "| --- | --- |"] - summary_lines.extend([f"| {label} | {count} |" for label, count in table_rows]) + rows = [header, separator] + + for from_state in STATES: + cells = [] + for to_state in STATES: + count = len(transitions[(from_state, to_state)]) + classification = classify_transition(from_state, to_state) + cells.append(format_cell(count, classification)) + rows.append(f"| {from_state} | " + " | ".join(cells) + " |") + + return rows def write_summary_section(f, title: str, tests: List[str], max_show: int = 20) -> None: """Write a collapsible section with test names to the summary.""" @@ -525,52 +593,104 @@ jobs: summary_path = os.environ.get("GITHUB_STEP_SUMMARY") if summary_path: with Path(summary_path).open("a", encoding="utf-8") as summary_file: - summary_file.write("### Regression Breakdown\n\n") - summary_file.write("\n".join(summary_lines) + "\n") - if fail_to_skip or fail_to_pass: - summary_file.write("\n_Improvements are highlighted for visibility and do not fail the job._\n") - - # Write detailed test lists - write_summary_section(summary_file, "❌ Pass → Fail", pass_to_fail) - write_summary_section(summary_file, "⚠️ Pass → Skip/XFail", pass_to_skip) - write_summary_section(summary_file, "🔴 Pass → Gone", pass_to_gone) - write_summary_section(summary_file, "🟡 Fail → Gone", fail_to_gone) - write_summary_section(summary_file, "✅ Fail → Pass (Improvement)", fail_to_pass) - write_summary_section(summary_file, "⏭️ Fail → Skip (Improvement)", fail_to_skip) - write_summary_section(summary_file, "🆕 New Tests", new_tests) - - if discovery_regressions: - summary_file.write(f"\n
\n⚠️ Discovery Warnings ({len(discovery_regressions)} new)\n\n") + summary_file.write("### Regression Matrix\n\n") + summary_file.write("**Bold** = regression, *Italic* = improvement, `-` = no change\n\n") + summary_file.write("\n".join(build_matrix_table()) + "\n") + + # Discovery warnings section (between matrix and details) + if discovery_warnings: + summary_file.write(f"\n### New Discovery Warnings ({len(discovery_warnings)})\n\n") + summary_file.write("
\nView warnings\n\n") summary_file.write("```\n") - for warning in discovery_regressions[:10]: + for warning in discovery_warnings[:10]: truncated = (warning[:300] + "...") if len(warning) > 300 else warning summary_file.write(f"{truncated}\n\n") - if len(discovery_regressions) > 10: - summary_file.write(f"... and {len(discovery_regressions) - 10} more\n") + if len(discovery_warnings) > 10: + summary_file.write(f"... and {len(discovery_warnings) - 10} more\n") summary_file.write("```\n
\n") - print("📊 Regression Analysis Results:") - for label, count in table_rows: - print(f" {label}: {count}") + # Collapsible detail sections + summary_file.write("\n### Test Details\n") + + # Regressions first + for from_state in STATES: + for to_state in STATES: + tests = transitions[(from_state, to_state)] + classification = classify_transition(from_state, to_state) + if tests and classification == "regression": + write_summary_section( + summary_file, + f"❌ {from_state} → {to_state}", + tests + ) + + # Improvements + for from_state in STATES: + for to_state in STATES: + tests = transitions[(from_state, to_state)] + classification = classify_transition(from_state, to_state) + if tests and classification == "improvement": + write_summary_section( + summary_file, + f"✅ {from_state} → {to_state}", + tests + ) + + # Neutrals (excluding diagonal and new tests) + for from_state in STATES: + for to_state in STATES: + tests = transitions[(from_state, to_state)] + classification = classify_transition(from_state, to_state) + if tests and classification == "neutral": + write_summary_section( + summary_file, + f"➡️ {from_state} → {to_state}", + tests + ) + + # New tests + for to_state in STATES: + if to_state == "Nonexistent": + continue + tests = transitions[("Nonexistent", to_state)] + if tests: + write_summary_section( + summary_file, + f"🆕 New {to_state}", + tests + ) + + # Console output + print("📊 Regression Matrix:") + for line in build_matrix_table(): + print(f" {line}") + + if discovery_warnings: + print(f"\n⚠️ New Discovery Warnings: {len(discovery_warnings)}") + if has_regressions: - print(f"❌ Total regressions detected: {regression_count}") + print(f"\n❌ Total regressions detected: {regression_count}") else: - print("✅ No regressions detected in monitored categories.") + print("\n✅ No regressions detected.") def sanitize(value: str) -> str: return value.replace("%", "%25").replace("\n", "%0A").replace("\r", "%0D") + # Backwards compatible outputs plus new matrix counts + pass_to_skip = transitions[("Pass", "Skip")] + transitions[("Pass", "XFail")] outputs = { "has_regressions": "true" if has_regressions else "false", "regression_count": str(regression_count), - "pass_to_fail_count": str(len(pass_to_fail)), + "pass_to_fail_count": str(len(transitions[("Pass", "Fail")])), "pass_to_skip_count": str(len(pass_to_skip)), - "pass_to_gone_count": str(len(pass_to_gone)), - "fail_to_gone_count": str(len(fail_to_gone)), - "discovery_regression_count": str(len(discovery_regressions)), - "fail_to_skip_count": str(len(fail_to_skip)), - "fail_to_pass_count": str(len(fail_to_pass)), - "new_tests_count": str(len(new_tests)), + "pass_to_gone_count": str(len(transitions[("Pass", "Nonexistent")])), + "fail_to_gone_count": str(len(transitions[("Fail", "Nonexistent")])), + "discovery_regression_count": str(len(discovery_warnings)), + "fail_to_skip_count": str(len(transitions[("Fail", "Skip")])), + "fail_to_pass_count": str(len(transitions[("Fail", "Pass")])), + "new_tests_count": str(sum( + len(transitions[("Nonexistent", s)]) for s in STATES if s != "Nonexistent" + )), } github_output = os.environ.get("GITHUB_OUTPUT") diff --git a/.github/workflows/test-cs-nunit.yml b/.github/workflows/test-cs-nunit.yml index ae9f7c7..719baf9 100644 --- a/.github/workflows/test-cs-nunit.yml +++ b/.github/workflows/test-cs-nunit.yml @@ -213,6 +213,7 @@ jobs: pr_passed = 0 pr_percentage = 0 failing_tests = [] + error_tests = [] skipped_tests = [] xfailed_tests = [] all_tests = [] @@ -250,8 +251,10 @@ jobs: 
all_tests.append(nodeid) # Track all tests regardless of outcome if outcome == 'passed': passing_tests.append(nodeid) - elif outcome in ['failed', 'error']: + elif outcome == 'failed': failing_tests.append(nodeid) + elif outcome == 'error': + error_tests.append(nodeid) elif outcome == 'skipped': skipped_tests.append(nodeid) # Extract skip reason @@ -279,15 +282,16 @@ jobs: elif 'call' in test and test['call'] and 'longrepr' in test['call']: xfail_reason = str(test['call']['longrepr']) xfailed_tests_with_reasons[nodeid] = xfail_reason.strip() - + print(f'Found {len(passing_tests)} passing tests') print(f'Found {len(failing_tests)} failing tests') + print(f'Found {len(error_tests)} error tests') print(f'Found {len(skipped_tests)} skipped tests') print(f'Found {len(xfailed_tests)} xfailed tests') print(f'Found {len(all_tests)} total discovered tests') else: print('No valid summary structure found') - + # Calculate percentage safely pr_percentage = (pr_passed / pr_total * 100) if pr_total > 0 else 0 print(f'Pass percentage calculated: {pr_percentage:.2f}%') @@ -309,6 +313,7 @@ jobs: print(f'Passed tests: {pr_passed}') print(f'Pass percentage: {pr_percentage:.2f}%') print(f'Failing tests: {len(failing_tests)}') + print(f'Error tests: {len(error_tests)}') print(f'Skipped tests: {len(skipped_tests)}') print(f'Xfailed tests: {len(xfailed_tests)}') print(f'All discovered tests: {len(all_tests)}') @@ -319,6 +324,7 @@ jobs: f.write(f'passed={pr_passed}\\n') f.write(f'percentage={pr_percentage:.2f}\\n') f.write(f'failing_count={len(failing_tests)}\\n') + f.write(f'error_count={len(error_tests)}\\n') f.write(f'skipped_count={len(skipped_tests)}\\n') f.write(f'xfailed_count={len(xfailed_tests)}\\n') @@ -365,6 +371,7 @@ jobs: test_data = { 'passing_tests': passing_tests, 'failing_tests': failing_tests, + 'error_tests': error_tests, 'skipped_tests': skipped_tests, 'xfailed_tests': xfailed_tests, 'all_tests': all_tests, @@ -545,6 +552,7 @@ jobs: target_percentage = 0 passing_tests = [] failing_tests = [] + error_tests = [] skipped_tests = [] xfailed_tests = [] all_tests = [] @@ -579,15 +587,18 @@ jobs: all_tests.append(nodeid) # Track all tests regardless of outcome if outcome == 'passed': passing_tests.append(nodeid) - elif outcome in ['failed', 'error']: + elif outcome == 'failed': failing_tests.append(nodeid) + elif outcome == 'error': + error_tests.append(nodeid) elif outcome == 'skipped': skipped_tests.append(nodeid) elif outcome == 'xfailed': xfailed_tests.append(nodeid) - + print(f'Found {len(passing_tests)} passing tests') print(f'Found {len(failing_tests)} failing tests') + print(f'Found {len(error_tests)} error tests') print(f'Found {len(skipped_tests)} skipped tests') print(f'Found {len(xfailed_tests)} xfailed tests') print(f'Found {len(all_tests)} total discovered tests') @@ -667,6 +678,7 @@ jobs: test_data = { 'passing_tests': passing_tests, 'failing_tests': failing_tests, + 'error_tests': error_tests, 'skipped_tests': skipped_tests, 'xfailed_tests': xfailed_tests, 'all_tests': all_tests, diff --git a/.github/workflows/test-cs-xunit.yml b/.github/workflows/test-cs-xunit.yml index a7e6f25..e0eca3c 100644 --- a/.github/workflows/test-cs-xunit.yml +++ b/.github/workflows/test-cs-xunit.yml @@ -213,6 +213,7 @@ jobs: pr_passed = 0 pr_percentage = 0 failing_tests = [] + error_tests = [] skipped_tests = [] xfailed_tests = [] all_tests = [] @@ -250,8 +251,10 @@ jobs: all_tests.append(nodeid) # Track all tests regardless of outcome if outcome == 'passed': passing_tests.append(nodeid) - elif outcome in 
['failed', 'error']: + elif outcome == 'failed': failing_tests.append(nodeid) + elif outcome == 'error': + error_tests.append(nodeid) elif outcome == 'skipped': skipped_tests.append(nodeid) # Extract skip reason @@ -365,6 +368,7 @@ jobs: test_data = { 'passing_tests': passing_tests, 'failing_tests': failing_tests, + 'error_tests': error_tests, 'skipped_tests': skipped_tests, 'xfailed_tests': xfailed_tests, 'all_tests': all_tests, @@ -545,6 +549,7 @@ jobs: target_percentage = 0 passing_tests = [] failing_tests = [] + error_tests = [] skipped_tests = [] xfailed_tests = [] all_tests = [] @@ -579,15 +584,18 @@ jobs: all_tests.append(nodeid) # Track all tests regardless of outcome if outcome == 'passed': passing_tests.append(nodeid) - elif outcome in ['failed', 'error']: + elif outcome == 'failed': failing_tests.append(nodeid) + elif outcome == 'error': + error_tests.append(nodeid) elif outcome == 'skipped': skipped_tests.append(nodeid) elif outcome == 'xfailed': xfailed_tests.append(nodeid) - + print(f'Found {len(passing_tests)} passing tests') print(f'Found {len(failing_tests)} failing tests') + print(f'Found {len(error_tests)} error tests') print(f'Found {len(skipped_tests)} skipped tests') print(f'Found {len(xfailed_tests)} xfailed tests') print(f'Found {len(all_tests)} total discovered tests') @@ -667,6 +675,7 @@ jobs: test_data = { 'passing_tests': passing_tests, 'failing_tests': failing_tests, + 'error_tests': error_tests, 'skipped_tests': skipped_tests, 'xfailed_tests': xfailed_tests, 'all_tests': all_tests, diff --git a/.github/workflows/test-py-pytest.yml b/.github/workflows/test-py-pytest.yml index 51c0468..3bf53cf 100644 --- a/.github/workflows/test-py-pytest.yml +++ b/.github/workflows/test-py-pytest.yml @@ -52,6 +52,9 @@ on: failing_count: description: "Number of failing tests" value: ${{ jobs.test.outputs.failing_count }} + error_count: + description: "Number of errored tests" + value: ${{ jobs.test.outputs.error_count }} skipped_count: description: "Number of skipped tests" value: ${{ jobs.test.outputs.skipped_count }} @@ -71,6 +74,7 @@ jobs: has_errors: ${{ steps.check-collection.outputs.has_errors }} error_type: ${{ steps.check-collection.outputs.error_type }} failing_count: ${{ steps.extract-results.outputs.failing_count }} + error_count: ${{ steps.extract-results.outputs.error_count }} skipped_count: ${{ steps.extract-results.outputs.skipped_count }} xfailed_count: ${{ steps.extract-results.outputs.xfailed_count }} @@ -185,6 +189,7 @@ jobs: percentage = 0.0 passing_tests = [] failing_tests = [] + error_tests = [] skipped_tests = [] xfailed_tests = [] all_tests = [] @@ -209,8 +214,10 @@ jobs: all_tests.append(nodeid) if outcome == 'passed': passing_tests.append(nodeid) - elif outcome in ['failed', 'error']: + elif outcome == 'failed': failing_tests.append(nodeid) + elif outcome == 'error': + error_tests.append(nodeid) elif outcome == 'skipped': skipped_tests.append(nodeid) reason = test.get('longrepr', 'No reason') @@ -257,6 +264,7 @@ jobs: json.dump({ 'passing_tests': passing_tests, 'failing_tests': failing_tests, + 'error_tests': error_tests, 'skipped_tests': skipped_tests, 'xfailed_tests': xfailed_tests, 'all_tests': all_tests, @@ -272,6 +280,7 @@ jobs: f.write(f'passed={passed}\n') f.write(f'percentage={percentage:.2f}\n') f.write(f'failing_count={len(failing_tests)}\n') + f.write(f'error_count={len(error_tests)}\n') f.write(f'skipped_count={len(skipped_tests)}\n') f.write(f'xfailed_count={len(xfailed_tests)}\n') " From ff8a93a8f0a0c3390289cd7e670d89a9c6d83841 Mon Sep 17 
00:00:00 2001 From: Claude Date: Thu, 18 Dec 2025 18:33:40 +0000 Subject: [PATCH 2/5] Use branch names in regression matrix header Replace generic "Baseline" and "Current" labels with actual branch names from workflow inputs (baseline_label, current_label) --- .github/workflows/regression-test.yml | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/.github/workflows/regression-test.yml b/.github/workflows/regression-test.yml index 65a9107..e097be9 100644 --- a/.github/workflows/regression-test.yml +++ b/.github/workflows/regression-test.yml @@ -284,6 +284,9 @@ jobs: PY - name: Analyze regression data id: analyze + env: + BASELINE_LABEL: ${{ inputs.baseline_label }} + CURRENT_LABEL: ${{ inputs.current_label }} run: | python3 - <<'PY' import json @@ -291,6 +294,10 @@ jobs: from pathlib import Path from typing import List, TextIO, Dict, Set, Tuple + # Get branch labels from environment + BASELINE_LABEL = os.environ.get("BASELINE_LABEL", "Baseline") + CURRENT_LABEL = os.environ.get("CURRENT_LABEL", "Current") + # States ordered from best to worst for matrix display # This ordering puts improvements below diagonal, regressions above STATES = ["Pass", "Skip", "XFail", "Fail", "Error", "Nonexistent"] @@ -562,8 +569,8 @@ jobs: # Build the matrix table for GitHub summary def build_matrix_table() -> List[str]: """Build markdown matrix table with state transitions.""" - # Header row - header = "| Baseline ↓ / Current → | " + " | ".join(STATES) + " |" + # Header row with branch names + header = f"| {BASELINE_LABEL} ↓ / {CURRENT_LABEL} → | " + " | ".join(STATES) + " |" separator = "| --- | " + " | ".join(["---"] * len(STATES)) + " |" rows = [header, separator] From b417e74ff9a7c720113c9e5bab985db1ed891bc0 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 20 Dec 2025 17:24:25 +0000 Subject: [PATCH 3/5] Fix error_tests tracking mismatch in run-branch-test.yml MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The target branch inline extraction code was grouping 'error' outcomes with 'failed', while test-py-pytest.yml tracks them separately. This mismatch caused phantom state transitions in the regression matrix (e.g., thousands of "Fail → Error" shown because the baseline grouped errors with failures but the current branch tracked them separately). 
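As a minimal illustration (the test id here is hypothetical), the analyzer intersects the baseline failing set with the current error set, so a test whose outcome never actually changed still lands in the Fail → Error cell:

    # Old target-branch extraction folded errors into failing_tests ...
    baseline = {"failing_tests": ["tests/test_io.py::test_read"], "error_tests": []}
    # ... while test-py-pytest.yml reports the same errored test separately.
    current = {"failing_tests": [], "error_tests": ["tests/test_io.py::test_read"]}

    phantom = set(baseline["failing_tests"]) & set(current["error_tests"])
    assert phantom == {"tests/test_io.py::test_read"}  # reported as Fail -> Error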
Changes: - Add error_tests list to track error outcomes separately - Split 'elif outcome in ["failed", "error"]' into separate conditions - Add error_tests to test_data.json output - Add error_count to job outputs and cache --- .github/workflows/run-branch-test.yml | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/.github/workflows/run-branch-test.yml b/.github/workflows/run-branch-test.yml index 4f06379..c91b2f7 100644 --- a/.github/workflows/run-branch-test.yml +++ b/.github/workflows/run-branch-test.yml @@ -84,6 +84,7 @@ jobs: has_errors: ${{ steps.results.outputs.has_errors }} error_type: ${{ steps.results.outputs.error_type }} failing_count: ${{ steps.results.outputs.failing_count }} + error_count: ${{ steps.results.outputs.error_count }} skipped_count: ${{ steps.results.outputs.skipped_count }} xfailed_count: ${{ steps.results.outputs.xfailed_count }} @@ -343,6 +344,7 @@ jobs: percentage = 0.0 passing_tests = [] failing_tests = [] + error_tests = [] skipped_tests = [] xfailed_tests = [] all_tests = [] @@ -367,8 +369,10 @@ jobs: all_tests.append(nodeid) if outcome == 'passed': passing_tests.append(nodeid) - elif outcome in ['failed', 'error']: + elif outcome == 'failed': failing_tests.append(nodeid) + elif outcome == 'error': + error_tests.append(nodeid) elif outcome == 'skipped': skipped_tests.append(nodeid) reason = test.get('longrepr', 'No reason') @@ -415,6 +419,7 @@ jobs: json.dump({ 'passing_tests': passing_tests, 'failing_tests': failing_tests, + 'error_tests': error_tests, 'skipped_tests': skipped_tests, 'xfailed_tests': xfailed_tests, 'all_tests': all_tests, @@ -430,6 +435,7 @@ jobs: f.write(f'passed={passed}\n') f.write(f'percentage={percentage:.2f}\n') f.write(f'failing_count={len(failing_tests)}\n') + f.write(f'error_count={len(error_tests)}\n') f.write(f'skipped_count={len(skipped_tests)}\n') f.write(f'xfailed_count={len(xfailed_tests)}\n') " @@ -460,6 +466,7 @@ jobs: has_errors=${{ steps.check-collection.outputs.has_errors || 'false' }} error_type=${{ steps.check-collection.outputs.error_type || 'none' }} failing_count=${{ steps.extract-results.outputs.failing_count || '0' }} + error_count=${{ steps.extract-results.outputs.error_count || '0' }} skipped_count=${{ steps.extract-results.outputs.skipped_count || '0' }} xfailed_count=${{ steps.extract-results.outputs.xfailed_count || '0' }} EOF @@ -501,6 +508,7 @@ jobs: echo "has_errors=${{ steps.load-cache.outputs.has_errors || 'false' }}" >> $GITHUB_OUTPUT echo "error_type=${{ steps.load-cache.outputs.error_type || 'none' }}" >> $GITHUB_OUTPUT echo "failing_count=${{ steps.load-cache.outputs.failing_count || '0' }}" >> $GITHUB_OUTPUT + echo "error_count=${{ steps.load-cache.outputs.error_count || '0' }}" >> $GITHUB_OUTPUT echo "skipped_count=${{ steps.load-cache.outputs.skipped_count || '0' }}" >> $GITHUB_OUTPUT echo "xfailed_count=${{ steps.load-cache.outputs.xfailed_count || '0' }}" >> $GITHUB_OUTPUT else @@ -513,6 +521,7 @@ jobs: echo "has_errors=${{ steps.check-collection.outputs.has_errors || 'false' }}" >> $GITHUB_OUTPUT echo "error_type=${{ steps.check-collection.outputs.error_type || 'none' }}" >> $GITHUB_OUTPUT echo "failing_count=${{ steps.extract-results.outputs.failing_count || '0' }}" >> $GITHUB_OUTPUT + echo "error_count=${{ steps.extract-results.outputs.error_count || '0' }}" >> $GITHUB_OUTPUT echo "skipped_count=${{ steps.extract-results.outputs.skipped_count || '0' }}" >> $GITHUB_OUTPUT echo "xfailed_count=${{ steps.extract-results.outputs.xfailed_count || '0' }}" >> 
$GITHUB_OUTPUT fi From 18662a0e9eb969ae3acb72c0455b8ff86e0eab55 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 21 Dec 2025 21:28:16 +0000 Subject: [PATCH 4/5] Add cache version to invalidate stale test results Old cached results may have errors grouped with failures (pre-fix). Adding a version string (v2) to the cache key ensures all PRs get fresh test results with proper error tracking. Bump CACHE_VERSION when extraction logic changes in the future. --- .github/workflows/run-branch-test.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/run-branch-test.yml b/.github/workflows/run-branch-test.yml index c91b2f7..db014d0 100644 --- a/.github/workflows/run-branch-test.yml +++ b/.github/workflows/run-branch-test.yml @@ -93,7 +93,9 @@ jobs: - name: Set cache keys id: cache-keys run: | - BASE_KEY="pytest-${{ inputs.target_branch }}-${{ github.event.pull_request.base.sha || github.sha }}" + # Version bump forces cache invalidation when extraction logic changes + CACHE_VERSION="v2" + BASE_KEY="pytest-${CACHE_VERSION}-${{ inputs.target_branch }}-${{ github.event.pull_request.base.sha || github.sha }}" echo "base_key=$BASE_KEY" >> $GITHUB_OUTPUT echo "pending_key=${BASE_KEY}-pending-${{ github.run_id }}" >> $GITHUB_OUTPUT echo "🔍 Cache base key: $BASE_KEY" From 5ec4f330f6dff1a0f9ef050c07359f31b8963ec8 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 21 Dec 2025 21:32:48 +0000 Subject: [PATCH 5/5] Add xpassed (unexpected pass) state tracking Adds tracking for pytest's 'xpassed' outcome - when a test marked with @pytest.mark.xfail unexpectedly passes. Changes: - test-py-pytest.yml: Track xpassed_tests separately - run-branch-test.yml: Track xpassed_tests in inline extraction code - regression-test.yml: Add XPass to STATES matrix (7x7 now) - XPass placed between Pass and Skip in state ordering - Pass <-> XPass transitions are neutral (marker change only) - XPass to worse states = regression - Worse states to XPass = improvement - Bump cache version to v3 to invalidate stale results --- .github/workflows/regression-test.yml | 23 +++++++++++++++++------ .github/workflows/run-branch-test.yml | 11 ++++++++++- .github/workflows/test-py-pytest.yml | 5 +++++ 3 files changed, 32 insertions(+), 7 deletions(-) diff --git a/.github/workflows/regression-test.yml b/.github/workflows/regression-test.yml index e097be9..108644b 100644 --- a/.github/workflows/regression-test.yml +++ b/.github/workflows/regression-test.yml @@ -300,8 +300,8 @@ jobs: # States ordered from best to worst for matrix display # This ordering puts improvements below diagonal, regressions above - STATES = ["Pass", "Skip", "XFail", "Fail", "Error", "Nonexistent"] - STATE_KEYS = ["passing", "skipped", "xfailed", "failing", "error", "nonexistent"] + STATES = ["Pass", "XPass", "Skip", "XFail", "Fail", "Error", "Nonexistent"] + STATE_KEYS = ["passing", "xpassed", "skipped", "xfailed", "failing", "error", "nonexistent"] def load_json(path: str) -> dict: file_path = Path(path) @@ -350,6 +350,8 @@ jobs: data["skipped"].add(str(test_id)) elif status in {"xfailed", "xfail"}: data["xfailed"].add(str(test_id)) + elif status in {"xpassed", "xpass"}: + data["xpassed"].add(str(test_id)) else: data["other"].add(str(test_id)) @@ -360,6 +362,7 @@ jobs: "error": set(coerce_list(raw.get("error_tests"))), "skipped": set(coerce_list(raw.get("skipped_tests"))), "xfailed": set(coerce_list(raw.get("xfailed_tests"))), + "xpassed": set(coerce_list(raw.get("xpassed_tests"))), "warnings": set(coerce_list(raw.get("warnings"))), 
"all": set(coerce_list(raw.get("all_tests"))), "other": set(), @@ -374,6 +377,7 @@ jobs: | data["error"] | data["skipped"] | data["xfailed"] + | data["xpassed"] | data["other"] ) @@ -393,19 +397,25 @@ jobs: # Anything to Nonexistent is regression if to_state == "Nonexistent": return "regression" - # Pass to anything else is regression - if from_state == "Pass": + # Pass/XPass to anything else (except each other) is regression + if from_state == "Pass" and to_state != "XPass": + return "regression" + if from_state == "XPass" and to_state != "Pass": return "regression" - # Skip to Fail/Error already covered above # Neutrals if from_state == "Fail" and to_state == "XFail": return "neutral" if from_state == "Skip" and to_state == "XFail": return "neutral" + # Pass <-> XPass is neutral (just adding/removing xfail marker) + if from_state == "Pass" and to_state == "XPass": + return "neutral" + if from_state == "XPass" and to_state == "Pass": + return "neutral" # Improvements (italic): moving to a better state - if to_state == "Pass": + if to_state in ["Pass", "XPass"]: return "improvement" if from_state in ["Fail", "Error"] and to_state == "Skip": return "improvement" @@ -432,6 +442,7 @@ jobs: """Get the set of tests in a given state.""" state_map = { "Pass": "passing", + "XPass": "xpassed", "Skip": "skipped", "XFail": "xfailed", "Fail": "failing", diff --git a/.github/workflows/run-branch-test.yml b/.github/workflows/run-branch-test.yml index db014d0..678e834 100644 --- a/.github/workflows/run-branch-test.yml +++ b/.github/workflows/run-branch-test.yml @@ -87,6 +87,7 @@ jobs: error_count: ${{ steps.results.outputs.error_count }} skipped_count: ${{ steps.results.outputs.skipped_count }} xfailed_count: ${{ steps.results.outputs.xfailed_count }} + xpassed_count: ${{ steps.results.outputs.xpassed_count }} steps: # Define cache keys @@ -94,7 +95,7 @@ jobs: id: cache-keys run: | # Version bump forces cache invalidation when extraction logic changes - CACHE_VERSION="v2" + CACHE_VERSION="v3" BASE_KEY="pytest-${CACHE_VERSION}-${{ inputs.target_branch }}-${{ github.event.pull_request.base.sha || github.sha }}" echo "base_key=$BASE_KEY" >> $GITHUB_OUTPUT echo "pending_key=${BASE_KEY}-pending-${{ github.run_id }}" >> $GITHUB_OUTPUT @@ -349,6 +350,7 @@ jobs: error_tests = [] skipped_tests = [] xfailed_tests = [] + xpassed_tests = [] all_tests = [] skipped_with_reasons = {} xfailed_with_reasons = {} @@ -387,6 +389,8 @@ jobs: if isinstance(reason, list): reason = reason[0] if reason else 'No reason' xfailed_with_reasons[nodeid] = str(reason).strip() + elif outcome == 'xpassed': + xpassed_tests.append(nodeid) percentage = (passed / total * 100) if total > 0 else 0 except FileNotFoundError: @@ -424,6 +428,7 @@ jobs: 'error_tests': error_tests, 'skipped_tests': skipped_tests, 'xfailed_tests': xfailed_tests, + 'xpassed_tests': xpassed_tests, 'all_tests': all_tests, 'skipped_tests_with_reasons': skipped_with_reasons, 'xfailed_tests_with_reasons': xfailed_with_reasons, @@ -440,6 +445,7 @@ jobs: f.write(f'error_count={len(error_tests)}\n') f.write(f'skipped_count={len(skipped_tests)}\n') f.write(f'xfailed_count={len(xfailed_tests)}\n') + f.write(f'xpassed_count={len(xpassed_tests)}\n') " - name: Save results to cache @@ -471,6 +477,7 @@ jobs: error_count=${{ steps.extract-results.outputs.error_count || '0' }} skipped_count=${{ steps.extract-results.outputs.skipped_count || '0' }} xfailed_count=${{ steps.extract-results.outputs.xfailed_count || '0' }} + xpassed_count=${{ steps.extract-results.outputs.xpassed_count || 
'0' }} EOF # Remove leading whitespace from the env file sed -i 's/^[[:space:]]*//' cached_target/outputs.env @@ -513,6 +520,7 @@ jobs: echo "error_count=${{ steps.load-cache.outputs.error_count || '0' }}" >> $GITHUB_OUTPUT echo "skipped_count=${{ steps.load-cache.outputs.skipped_count || '0' }}" >> $GITHUB_OUTPUT echo "xfailed_count=${{ steps.load-cache.outputs.xfailed_count || '0' }}" >> $GITHUB_OUTPUT + echo "xpassed_count=${{ steps.load-cache.outputs.xpassed_count || '0' }}" >> $GITHUB_OUTPUT else echo "📋 Using fresh results" echo "total=${{ steps.extract-results.outputs.total || '0' }}" >> $GITHUB_OUTPUT @@ -526,6 +534,7 @@ jobs: echo "error_count=${{ steps.extract-results.outputs.error_count || '0' }}" >> $GITHUB_OUTPUT echo "skipped_count=${{ steps.extract-results.outputs.skipped_count || '0' }}" >> $GITHUB_OUTPUT echo "xfailed_count=${{ steps.extract-results.outputs.xfailed_count || '0' }}" >> $GITHUB_OUTPUT + echo "xpassed_count=${{ steps.extract-results.outputs.xpassed_count || '0' }}" >> $GITHUB_OUTPUT fi # Compare results diff --git a/.github/workflows/test-py-pytest.yml b/.github/workflows/test-py-pytest.yml index 3bf53cf..22d65a4 100644 --- a/.github/workflows/test-py-pytest.yml +++ b/.github/workflows/test-py-pytest.yml @@ -192,6 +192,7 @@ jobs: error_tests = [] skipped_tests = [] xfailed_tests = [] + xpassed_tests = [] all_tests = [] skipped_with_reasons = {} xfailed_with_reasons = {} @@ -230,6 +231,8 @@ jobs: if isinstance(reason, list): reason = reason[0] if reason else 'No reason' xfailed_with_reasons[nodeid] = str(reason).strip() + elif outcome == 'xpassed': + xpassed_tests.append(nodeid) percentage = (passed / total * 100) if total > 0 else 0 except FileNotFoundError: @@ -267,6 +270,7 @@ jobs: 'error_tests': error_tests, 'skipped_tests': skipped_tests, 'xfailed_tests': xfailed_tests, + 'xpassed_tests': xpassed_tests, 'all_tests': all_tests, 'skipped_tests_with_reasons': skipped_with_reasons, 'xfailed_tests_with_reasons': xfailed_with_reasons, @@ -283,6 +287,7 @@ jobs: f.write(f'error_count={len(error_tests)}\n') f.write(f'skipped_count={len(skipped_tests)}\n') f.write(f'xfailed_count={len(xfailed_tests)}\n') + f.write(f'xpassed_count={len(xpassed_tests)}\n') " - name: Upload test artifacts
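For reference, a rough sketch of the per-run results JSON that the test workflows write and the regression analyzer consumes after this series. The key names are taken from the diffs above; the test ids are hypothetical, and the reason maps and count fields the workflows also emit are left out for brevity:

    import json

    sample_results = {
        "passing_tests": ["tests/test_core.py::test_add"],
        "failing_tests": ["tests/test_core.py::test_div"],
        "error_tests": ["tests/test_io.py::test_open"],        # split out from failing_tests in this series
        "skipped_tests": [],
        "xfailed_tests": ["tests/test_core.py::test_legacy"],
        "xpassed_tests": ["tests/test_core.py::test_flaky"],   # added in PATCH 5/5
        "all_tests": [
            "tests/test_core.py::test_add",
            "tests/test_core.py::test_div",
            "tests/test_io.py::test_open",
            "tests/test_core.py::test_legacy",
            "tests/test_core.py::test_flaky",
        ],
        "warnings": [],                                        # optional; compared for new discovery warnings
    }

    with open("current_results.json", "w") as fh:
        json.dump(sample_results, fh, indent=2)

Keys that are absent simply default to empty sets in build_status_sets (it reads them with .get), so the cache-version bumps in PATCH 4/5 and 5/5 are mainly about keeping the baseline and current runs on the same schema rather than about avoiding crashes.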