From 5f4df34abce1fede5fedf08911bfe42c5f8b9e24 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 18 Dec 2025 18:22:19 +0000 Subject: [PATCH 1/5] Pivot regression breakdown to matrix table format - Replace linear regression breakdown with 6x6 state transition matrix - States ordered: Pass, Skip, XFail, Fail, Error, Nonexistent - Improvements appear below diagonal, regressions above - Bold formatting for regressions, italic for improvements - Add Error as separate state (previously grouped with Fail) - Track error_tests separately in pytest, nunit, xunit workflows - Move discovery warnings section between matrix and detail lists - Keep collapsible detail sections for test names below matrix --- .github/workflows/regression-test.yml | 380 +++++++++++++++++--------- .github/workflows/test-cs-nunit.yml | 22 +- .github/workflows/test-cs-xunit.yml | 15 +- .github/workflows/test-py-pytest.yml | 11 +- 4 files changed, 289 insertions(+), 139 deletions(-) diff --git a/.github/workflows/regression-test.yml b/.github/workflows/regression-test.yml index 42dcd9c..65a9107 100644 --- a/.github/workflows/regression-test.yml +++ b/.github/workflows/regression-test.yml @@ -289,7 +289,12 @@ jobs: import json import os from pathlib import Path - from typing import List, TextIO + from typing import List, TextIO, Dict, Set, Tuple + + # States ordered from best to worst for matrix display + # This ordering puts improvements below diagonal, regressions above + STATES = ["Pass", "Skip", "XFail", "Fail", "Error", "Nonexistent"] + STATE_KEYS = ["passing", "skipped", "xfailed", "failing", "error", "nonexistent"] def load_json(path: str) -> dict: file_path = Path(path) @@ -330,8 +335,10 @@ jobs: status = status.lower() if status in {"passed", "pass"}: data["passing"].add(str(test_id)) - elif status in {"failed", "fail", "error"}: + elif status in {"failed", "fail"}: data["failing"].add(str(test_id)) + elif status == "error": + data["error"].add(str(test_id)) elif status in {"skipped", "skip"}: data["skipped"].add(str(test_id)) elif status in {"xfailed", "xfail"}: @@ -343,6 +350,7 @@ jobs: data = { "passing": set(coerce_list(raw.get("passing_tests"))), "failing": set(coerce_list(raw.get("failing_tests"))), + "error": set(coerce_list(raw.get("error_tests"))), "skipped": set(coerce_list(raw.get("skipped_tests"))), "xfailed": set(coerce_list(raw.get("xfailed_tests"))), "warnings": set(coerce_list(raw.get("warnings"))), @@ -356,6 +364,7 @@ jobs: data["all"].update( data["passing"] | data["failing"] + | data["error"] | data["skipped"] | data["xfailed"] | data["other"] @@ -363,50 +372,104 @@ jobs: return data + def classify_transition(from_state: str, to_state: str) -> str: + """Classify a state transition as regression, improvement, neutral, or new.""" + if from_state == to_state: + return "diagonal" + if from_state == "Nonexistent": + return "new" + + # Regressions (bold): moving to a worse state + # Anything to Fail or Error is regression + if to_state in ["Fail", "Error"]: + return "regression" + # Anything to Nonexistent is regression + if to_state == "Nonexistent": + return "regression" + # Pass to anything else is regression + if from_state == "Pass": + return "regression" + # Skip to Fail/Error already covered above + + # Neutrals + if from_state == "Fail" and to_state == "XFail": + return "neutral" + if from_state == "Skip" and to_state == "XFail": + return "neutral" + + # Improvements (italic): moving to a better state + if to_state == "Pass": + return "improvement" + if from_state in ["Fail", "Error"] and to_state == 
"Skip": + return "improvement" + if from_state == "Error" and to_state in ["XFail", "Fail"]: + return "improvement" + if from_state == "XFail" and to_state == "Skip": + return "improvement" + + return "neutral" + + def format_cell(count: int, classification: str) -> str: + """Format a cell value based on its classification.""" + if classification == "diagonal": + return "-" + if count == 0: + return "0" + if classification == "regression": + return f"**{count}**" + if classification == "improvement": + return f"*{count}*" + return str(count) + + def get_state_set(data: dict, state: str, all_tests: Set[str]) -> Set[str]: + """Get the set of tests in a given state.""" + state_map = { + "Pass": "passing", + "Skip": "skipped", + "XFail": "xfailed", + "Fail": "failing", + "Error": "error", + "Nonexistent": "nonexistent", + } + if state == "Nonexistent": + return all_tests - data["all"] + return data.get(state_map[state], set()) + baseline_data = build_status_sets(load_json("baseline_results.json")) current_data = build_status_sets(load_json("current_results.json")) - pass_to_fail = sorted(baseline_data["passing"] & current_data["failing"]) - pass_to_skip = sorted( - baseline_data["passing"] & (current_data["skipped"] | current_data["xfailed"]) - ) - pass_to_gone = sorted(baseline_data["passing"] - current_data["all"]) - fail_to_gone = sorted(baseline_data["failing"] - current_data["all"]) - discovery_regressions = sorted(current_data["warnings"] - baseline_data["warnings"]) - - fail_to_skip = sorted(baseline_data["failing"] & current_data["skipped"]) - fail_to_pass = sorted(baseline_data["failing"] & current_data["passing"]) - new_tests = sorted(current_data["all"] - baseline_data["all"]) - - regression_count = ( - len(pass_to_fail) - + len(pass_to_skip) - + len(pass_to_gone) - + len(fail_to_gone) - + len(discovery_regressions) - ) + # Get all tests across both runs + all_tests_union = baseline_data["all"] | current_data["all"] + + # Compute all state transitions + transitions: Dict[Tuple[str, str], List[str]] = {} + for from_state in STATES: + for to_state in STATES: + from_set = get_state_set(baseline_data, from_state, all_tests_union) + to_set = get_state_set(current_data, to_state, all_tests_union) + transitions[(from_state, to_state)] = sorted(from_set & to_set) + + # Discovery warnings (separate from state transitions) + discovery_warnings = sorted(current_data["warnings"] - baseline_data["warnings"]) + + # Calculate regression count (same logic as before, but expanded) + regression_transitions = [] + for (from_state, to_state), tests in transitions.items(): + if classify_transition(from_state, to_state) == "regression": + regression_transitions.extend(tests) + + regression_count = len(set(regression_transitions)) + len(discovery_warnings) has_regressions = regression_count > 0 + # Build analysis payload analysis_payload = { - "pass_to_fail": pass_to_fail, - "pass_to_skip": pass_to_skip, - "pass_to_gone": pass_to_gone, - "fail_to_gone": fail_to_gone, - "discovery_regressions": discovery_regressions, - "fail_to_skip": fail_to_skip, - "fail_to_pass": fail_to_pass, - "new_tests": new_tests, + "transitions": {f"{f}_to_{t}": tests for (f, t), tests in transitions.items()}, + "discovery_warnings": discovery_warnings, "counts": { - "pass_to_fail": len(pass_to_fail), - "pass_to_skip": len(pass_to_skip), - "pass_to_gone": len(pass_to_gone), - "fail_to_gone": len(fail_to_gone), - "discovery": len(discovery_regressions), - "fail_to_skip": len(fail_to_skip), - "fail_to_pass": len(fail_to_pass), - 
"new_tests": len(new_tests), + f"{f}_to_{t}": len(tests) for (f, t), tests in transitions.items() }, } + analysis_payload["counts"]["discovery_warnings"] = len(discovery_warnings) Path("regression_analysis.json").write_text( json.dumps(analysis_payload, indent=2), @@ -422,67 +485,67 @@ jobs: handle.write(f" {idx}. {test_name}\n") handle.write("\n") + # Write comprehensive text report with Path("comprehensive_regression_report.txt").open("w", encoding="utf-8") as report: report.write("COMPREHENSIVE REGRESSION ANALYSIS\n") report.write("=" * 50 + "\n\n") - write_section( - report, - "PASS-TO-FAIL REGRESSIONS", - pass_to_fail, - "Previously passing, now failing:", - ) - write_section( - report, - "PASS-TO-SKIP REGRESSIONS", - pass_to_skip, - "Previously passing, now skipped or xfailed:", - ) - write_section( - report, - "FAIL-TO-SKIP IMPROVEMENTS", - fail_to_skip, - "Previously failing, now skipped (treated as improvements):", - ) - write_section( - report, - "FAIL-TO-PASS IMPROVEMENTS", - fail_to_pass, - "Previously failing, now passing (treated as improvements):", - ) - write_section( - report, - "PASS-TO-GONE REGRESSIONS", - pass_to_gone, - "Previously passing, now completely missing:", - ) - write_section( - report, - "FAIL-TO-GONE REGRESSIONS", - fail_to_gone, - "Previously failing, now completely missing:", - ) - - if discovery_regressions: - report.write( - f"DISCOVERY REGRESSIONS ({len(discovery_regressions)} warnings)\n" - ) + # Write regressions + for from_state in STATES: + for to_state in STATES: + tests = transitions[(from_state, to_state)] + if tests and classify_transition(from_state, to_state) == "regression": + write_section( + report, + f"{from_state.upper()}-TO-{to_state.upper()} REGRESSIONS", + tests, + f"Previously {from_state.lower()}, now {to_state.lower()}:", + ) + + # Write improvements + for from_state in STATES: + for to_state in STATES: + tests = transitions[(from_state, to_state)] + if tests and classify_transition(from_state, to_state) == "improvement": + write_section( + report, + f"{from_state.upper()}-TO-{to_state.upper()} IMPROVEMENTS", + tests, + f"Previously {from_state.lower()}, now {to_state.lower()}:", + ) + + if discovery_warnings: + report.write(f"DISCOVERY WARNINGS ({len(discovery_warnings)} new)\n") report.write("New warnings not present in baseline:\n") - for idx, warning in enumerate(discovery_regressions, 1): + for idx, warning in enumerate(discovery_warnings, 1): truncated = (warning[:200] + "...") if len(warning) > 200 else warning report.write(f" {idx}. 
{truncated}\n") report.write("\n") - write_section( - report, - "NEW TESTS", - new_tests, - "Tests present only in the current run:", - ) + # Write new tests + for to_state in STATES: + if to_state == "Nonexistent": + continue + tests = transitions[("Nonexistent", to_state)] + if tests: + write_section( + report, + f"NEW TESTS ({to_state.upper()})", + tests, + f"New tests in {to_state.lower()} state:", + ) - if not has_regressions and not (fail_to_skip or fail_to_pass or new_tests): - report.write("No regressions or test suite changes detected.\n") + if not has_regressions: + any_changes = any( + len(tests) > 0 + for (f, t), tests in transitions.items() + if f != t + ) + if not any_changes and not discovery_warnings: + report.write("No regressions or test suite changes detected.\n") + # Write pass-to-fail details for backwards compatibility + pass_to_fail = transitions[("Pass", "Fail")] if pass_to_fail: with Path("regression_details.txt").open("w", encoding="utf-8") as handle: handle.write( @@ -496,19 +559,24 @@ jobs: encoding="utf-8", ) - table_rows = [ - ("Pass → Fail", len(pass_to_fail)), - ("Pass → Skip/XFail", len(pass_to_skip)), - ("Pass → Gone", len(pass_to_gone)), - ("Fail → Gone", len(fail_to_gone)), - ("Discovery Warnings", len(discovery_regressions)), - ("Fail → Skip (Improvement)", len(fail_to_skip)), - ("Fail → Pass (Improvement)", len(fail_to_pass)), - ("New Tests", len(new_tests)), - ] + # Build the matrix table for GitHub summary + def build_matrix_table() -> List[str]: + """Build markdown matrix table with state transitions.""" + # Header row + header = "| Baseline ↓ / Current → | " + " | ".join(STATES) + " |" + separator = "| --- | " + " | ".join(["---"] * len(STATES)) + " |" - summary_lines = ["| Category | Count |", "| --- | --- |"] - summary_lines.extend([f"| {label} | {count} |" for label, count in table_rows]) + rows = [header, separator] + + for from_state in STATES: + cells = [] + for to_state in STATES: + count = len(transitions[(from_state, to_state)]) + classification = classify_transition(from_state, to_state) + cells.append(format_cell(count, classification)) + rows.append(f"| {from_state} | " + " | ".join(cells) + " |") + + return rows def write_summary_section(f, title: str, tests: List[str], max_show: int = 20) -> None: """Write a collapsible section with test names to the summary.""" @@ -525,52 +593,104 @@ jobs: summary_path = os.environ.get("GITHUB_STEP_SUMMARY") if summary_path: with Path(summary_path).open("a", encoding="utf-8") as summary_file: - summary_file.write("### Regression Breakdown\n\n") - summary_file.write("\n".join(summary_lines) + "\n") - if fail_to_skip or fail_to_pass: - summary_file.write("\n_Improvements are highlighted for visibility and do not fail the job._\n") - - # Write detailed test lists - write_summary_section(summary_file, "❌ Pass → Fail", pass_to_fail) - write_summary_section(summary_file, "⚠️ Pass → Skip/XFail", pass_to_skip) - write_summary_section(summary_file, "🔴 Pass → Gone", pass_to_gone) - write_summary_section(summary_file, "🟡 Fail → Gone", fail_to_gone) - write_summary_section(summary_file, "✅ Fail → Pass (Improvement)", fail_to_pass) - write_summary_section(summary_file, "⏭️ Fail → Skip (Improvement)", fail_to_skip) - write_summary_section(summary_file, "🆕 New Tests", new_tests) - - if discovery_regressions: - summary_file.write(f"\n
\n⚠️ Discovery Warnings ({len(discovery_regressions)} new)\n\n") + summary_file.write("### Regression Matrix\n\n") + summary_file.write("**Bold** = regression, *Italic* = improvement, `-` = no change\n\n") + summary_file.write("\n".join(build_matrix_table()) + "\n") + + # Discovery warnings section (between matrix and details) + if discovery_warnings: + summary_file.write(f"\n### New Discovery Warnings ({len(discovery_warnings)})\n\n") + summary_file.write("
\nView warnings\n\n") summary_file.write("```\n") - for warning in discovery_regressions[:10]: + for warning in discovery_warnings[:10]: truncated = (warning[:300] + "...") if len(warning) > 300 else warning summary_file.write(f"{truncated}\n\n") - if len(discovery_regressions) > 10: - summary_file.write(f"... and {len(discovery_regressions) - 10} more\n") + if len(discovery_warnings) > 10: + summary_file.write(f"... and {len(discovery_warnings) - 10} more\n") summary_file.write("```\n
\n") - print("📊 Regression Analysis Results:") - for label, count in table_rows: - print(f" {label}: {count}") + # Collapsible detail sections + summary_file.write("\n### Test Details\n") + + # Regressions first + for from_state in STATES: + for to_state in STATES: + tests = transitions[(from_state, to_state)] + classification = classify_transition(from_state, to_state) + if tests and classification == "regression": + write_summary_section( + summary_file, + f"❌ {from_state} → {to_state}", + tests + ) + + # Improvements + for from_state in STATES: + for to_state in STATES: + tests = transitions[(from_state, to_state)] + classification = classify_transition(from_state, to_state) + if tests and classification == "improvement": + write_summary_section( + summary_file, + f"✅ {from_state} → {to_state}", + tests + ) + + # Neutrals (excluding diagonal and new tests) + for from_state in STATES: + for to_state in STATES: + tests = transitions[(from_state, to_state)] + classification = classify_transition(from_state, to_state) + if tests and classification == "neutral": + write_summary_section( + summary_file, + f"➡️ {from_state} → {to_state}", + tests + ) + + # New tests + for to_state in STATES: + if to_state == "Nonexistent": + continue + tests = transitions[("Nonexistent", to_state)] + if tests: + write_summary_section( + summary_file, + f"🆕 New {to_state}", + tests + ) + + # Console output + print("📊 Regression Matrix:") + for line in build_matrix_table(): + print(f" {line}") + + if discovery_warnings: + print(f"\n⚠️ New Discovery Warnings: {len(discovery_warnings)}") + if has_regressions: - print(f"❌ Total regressions detected: {regression_count}") + print(f"\n❌ Total regressions detected: {regression_count}") else: - print("✅ No regressions detected in monitored categories.") + print("\n✅ No regressions detected.") def sanitize(value: str) -> str: return value.replace("%", "%25").replace("\n", "%0A").replace("\r", "%0D") + # Backwards compatible outputs plus new matrix counts + pass_to_skip = transitions[("Pass", "Skip")] + transitions[("Pass", "XFail")] outputs = { "has_regressions": "true" if has_regressions else "false", "regression_count": str(regression_count), - "pass_to_fail_count": str(len(pass_to_fail)), + "pass_to_fail_count": str(len(transitions[("Pass", "Fail")])), "pass_to_skip_count": str(len(pass_to_skip)), - "pass_to_gone_count": str(len(pass_to_gone)), - "fail_to_gone_count": str(len(fail_to_gone)), - "discovery_regression_count": str(len(discovery_regressions)), - "fail_to_skip_count": str(len(fail_to_skip)), - "fail_to_pass_count": str(len(fail_to_pass)), - "new_tests_count": str(len(new_tests)), + "pass_to_gone_count": str(len(transitions[("Pass", "Nonexistent")])), + "fail_to_gone_count": str(len(transitions[("Fail", "Nonexistent")])), + "discovery_regression_count": str(len(discovery_warnings)), + "fail_to_skip_count": str(len(transitions[("Fail", "Skip")])), + "fail_to_pass_count": str(len(transitions[("Fail", "Pass")])), + "new_tests_count": str(sum( + len(transitions[("Nonexistent", s)]) for s in STATES if s != "Nonexistent" + )), } github_output = os.environ.get("GITHUB_OUTPUT") diff --git a/.github/workflows/test-cs-nunit.yml b/.github/workflows/test-cs-nunit.yml index ae9f7c7..719baf9 100644 --- a/.github/workflows/test-cs-nunit.yml +++ b/.github/workflows/test-cs-nunit.yml @@ -213,6 +213,7 @@ jobs: pr_passed = 0 pr_percentage = 0 failing_tests = [] + error_tests = [] skipped_tests = [] xfailed_tests = [] all_tests = [] @@ -250,8 +251,10 @@ jobs: 
all_tests.append(nodeid) # Track all tests regardless of outcome if outcome == 'passed': passing_tests.append(nodeid) - elif outcome in ['failed', 'error']: + elif outcome == 'failed': failing_tests.append(nodeid) + elif outcome == 'error': + error_tests.append(nodeid) elif outcome == 'skipped': skipped_tests.append(nodeid) # Extract skip reason @@ -279,15 +282,16 @@ jobs: elif 'call' in test and test['call'] and 'longrepr' in test['call']: xfail_reason = str(test['call']['longrepr']) xfailed_tests_with_reasons[nodeid] = xfail_reason.strip() - + print(f'Found {len(passing_tests)} passing tests') print(f'Found {len(failing_tests)} failing tests') + print(f'Found {len(error_tests)} error tests') print(f'Found {len(skipped_tests)} skipped tests') print(f'Found {len(xfailed_tests)} xfailed tests') print(f'Found {len(all_tests)} total discovered tests') else: print('No valid summary structure found') - + # Calculate percentage safely pr_percentage = (pr_passed / pr_total * 100) if pr_total > 0 else 0 print(f'Pass percentage calculated: {pr_percentage:.2f}%') @@ -309,6 +313,7 @@ jobs: print(f'Passed tests: {pr_passed}') print(f'Pass percentage: {pr_percentage:.2f}%') print(f'Failing tests: {len(failing_tests)}') + print(f'Error tests: {len(error_tests)}') print(f'Skipped tests: {len(skipped_tests)}') print(f'Xfailed tests: {len(xfailed_tests)}') print(f'All discovered tests: {len(all_tests)}') @@ -319,6 +324,7 @@ jobs: f.write(f'passed={pr_passed}\\n') f.write(f'percentage={pr_percentage:.2f}\\n') f.write(f'failing_count={len(failing_tests)}\\n') + f.write(f'error_count={len(error_tests)}\\n') f.write(f'skipped_count={len(skipped_tests)}\\n') f.write(f'xfailed_count={len(xfailed_tests)}\\n') @@ -365,6 +371,7 @@ jobs: test_data = { 'passing_tests': passing_tests, 'failing_tests': failing_tests, + 'error_tests': error_tests, 'skipped_tests': skipped_tests, 'xfailed_tests': xfailed_tests, 'all_tests': all_tests, @@ -545,6 +552,7 @@ jobs: target_percentage = 0 passing_tests = [] failing_tests = [] + error_tests = [] skipped_tests = [] xfailed_tests = [] all_tests = [] @@ -579,15 +587,18 @@ jobs: all_tests.append(nodeid) # Track all tests regardless of outcome if outcome == 'passed': passing_tests.append(nodeid) - elif outcome in ['failed', 'error']: + elif outcome == 'failed': failing_tests.append(nodeid) + elif outcome == 'error': + error_tests.append(nodeid) elif outcome == 'skipped': skipped_tests.append(nodeid) elif outcome == 'xfailed': xfailed_tests.append(nodeid) - + print(f'Found {len(passing_tests)} passing tests') print(f'Found {len(failing_tests)} failing tests') + print(f'Found {len(error_tests)} error tests') print(f'Found {len(skipped_tests)} skipped tests') print(f'Found {len(xfailed_tests)} xfailed tests') print(f'Found {len(all_tests)} total discovered tests') @@ -667,6 +678,7 @@ jobs: test_data = { 'passing_tests': passing_tests, 'failing_tests': failing_tests, + 'error_tests': error_tests, 'skipped_tests': skipped_tests, 'xfailed_tests': xfailed_tests, 'all_tests': all_tests, diff --git a/.github/workflows/test-cs-xunit.yml b/.github/workflows/test-cs-xunit.yml index a7e6f25..e0eca3c 100644 --- a/.github/workflows/test-cs-xunit.yml +++ b/.github/workflows/test-cs-xunit.yml @@ -213,6 +213,7 @@ jobs: pr_passed = 0 pr_percentage = 0 failing_tests = [] + error_tests = [] skipped_tests = [] xfailed_tests = [] all_tests = [] @@ -250,8 +251,10 @@ jobs: all_tests.append(nodeid) # Track all tests regardless of outcome if outcome == 'passed': passing_tests.append(nodeid) - elif outcome in 
['failed', 'error']: + elif outcome == 'failed': failing_tests.append(nodeid) + elif outcome == 'error': + error_tests.append(nodeid) elif outcome == 'skipped': skipped_tests.append(nodeid) # Extract skip reason @@ -365,6 +368,7 @@ jobs: test_data = { 'passing_tests': passing_tests, 'failing_tests': failing_tests, + 'error_tests': error_tests, 'skipped_tests': skipped_tests, 'xfailed_tests': xfailed_tests, 'all_tests': all_tests, @@ -545,6 +549,7 @@ jobs: target_percentage = 0 passing_tests = [] failing_tests = [] + error_tests = [] skipped_tests = [] xfailed_tests = [] all_tests = [] @@ -579,15 +584,18 @@ jobs: all_tests.append(nodeid) # Track all tests regardless of outcome if outcome == 'passed': passing_tests.append(nodeid) - elif outcome in ['failed', 'error']: + elif outcome == 'failed': failing_tests.append(nodeid) + elif outcome == 'error': + error_tests.append(nodeid) elif outcome == 'skipped': skipped_tests.append(nodeid) elif outcome == 'xfailed': xfailed_tests.append(nodeid) - + print(f'Found {len(passing_tests)} passing tests') print(f'Found {len(failing_tests)} failing tests') + print(f'Found {len(error_tests)} error tests') print(f'Found {len(skipped_tests)} skipped tests') print(f'Found {len(xfailed_tests)} xfailed tests') print(f'Found {len(all_tests)} total discovered tests') @@ -667,6 +675,7 @@ jobs: test_data = { 'passing_tests': passing_tests, 'failing_tests': failing_tests, + 'error_tests': error_tests, 'skipped_tests': skipped_tests, 'xfailed_tests': xfailed_tests, 'all_tests': all_tests, diff --git a/.github/workflows/test-py-pytest.yml b/.github/workflows/test-py-pytest.yml index 51c0468..3bf53cf 100644 --- a/.github/workflows/test-py-pytest.yml +++ b/.github/workflows/test-py-pytest.yml @@ -52,6 +52,9 @@ on: failing_count: description: "Number of failing tests" value: ${{ jobs.test.outputs.failing_count }} + error_count: + description: "Number of errored tests" + value: ${{ jobs.test.outputs.error_count }} skipped_count: description: "Number of skipped tests" value: ${{ jobs.test.outputs.skipped_count }} @@ -71,6 +74,7 @@ jobs: has_errors: ${{ steps.check-collection.outputs.has_errors }} error_type: ${{ steps.check-collection.outputs.error_type }} failing_count: ${{ steps.extract-results.outputs.failing_count }} + error_count: ${{ steps.extract-results.outputs.error_count }} skipped_count: ${{ steps.extract-results.outputs.skipped_count }} xfailed_count: ${{ steps.extract-results.outputs.xfailed_count }} @@ -185,6 +189,7 @@ jobs: percentage = 0.0 passing_tests = [] failing_tests = [] + error_tests = [] skipped_tests = [] xfailed_tests = [] all_tests = [] @@ -209,8 +214,10 @@ jobs: all_tests.append(nodeid) if outcome == 'passed': passing_tests.append(nodeid) - elif outcome in ['failed', 'error']: + elif outcome == 'failed': failing_tests.append(nodeid) + elif outcome == 'error': + error_tests.append(nodeid) elif outcome == 'skipped': skipped_tests.append(nodeid) reason = test.get('longrepr', 'No reason') @@ -257,6 +264,7 @@ jobs: json.dump({ 'passing_tests': passing_tests, 'failing_tests': failing_tests, + 'error_tests': error_tests, 'skipped_tests': skipped_tests, 'xfailed_tests': xfailed_tests, 'all_tests': all_tests, @@ -272,6 +280,7 @@ jobs: f.write(f'passed={passed}\n') f.write(f'percentage={percentage:.2f}\n') f.write(f'failing_count={len(failing_tests)}\n') + f.write(f'error_count={len(error_tests)}\n') f.write(f'skipped_count={len(skipped_tests)}\n') f.write(f'xfailed_count={len(xfailed_tests)}\n') " From ff8a93a8f0a0c3390289cd7e670d89a9c6d83841 Mon Sep 17 
00:00:00 2001 From: Claude Date: Thu, 18 Dec 2025 18:33:40 +0000 Subject: [PATCH 2/5] Use branch names in regression matrix header Replace generic "Baseline" and "Current" labels with actual branch names from workflow inputs (baseline_label, current_label) --- .github/workflows/regression-test.yml | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/.github/workflows/regression-test.yml b/.github/workflows/regression-test.yml index 65a9107..e097be9 100644 --- a/.github/workflows/regression-test.yml +++ b/.github/workflows/regression-test.yml @@ -284,6 +284,9 @@ jobs: PY - name: Analyze regression data id: analyze + env: + BASELINE_LABEL: ${{ inputs.baseline_label }} + CURRENT_LABEL: ${{ inputs.current_label }} run: | python3 - <<'PY' import json @@ -291,6 +294,10 @@ jobs: from pathlib import Path from typing import List, TextIO, Dict, Set, Tuple + # Get branch labels from environment + BASELINE_LABEL = os.environ.get("BASELINE_LABEL", "Baseline") + CURRENT_LABEL = os.environ.get("CURRENT_LABEL", "Current") + # States ordered from best to worst for matrix display # This ordering puts improvements below diagonal, regressions above STATES = ["Pass", "Skip", "XFail", "Fail", "Error", "Nonexistent"] @@ -562,8 +569,8 @@ jobs: # Build the matrix table for GitHub summary def build_matrix_table() -> List[str]: """Build markdown matrix table with state transitions.""" - # Header row - header = "| Baseline ↓ / Current → | " + " | ".join(STATES) + " |" + # Header row with branch names + header = f"| {BASELINE_LABEL} ↓ / {CURRENT_LABEL} → | " + " | ".join(STATES) + " |" separator = "| --- | " + " | ".join(["---"] * len(STATES)) + " |" rows = [header, separator] From b417e74ff9a7c720113c9e5bab985db1ed891bc0 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 20 Dec 2025 17:24:25 +0000 Subject: [PATCH 3/5] Fix error_tests tracking mismatch in run-branch-test.yml MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The target branch inline extraction code was grouping 'error' outcomes with 'failed', while test-py-pytest.yml tracks them separately. This mismatch caused phantom state transitions in the regression matrix (e.g., thousands of "Fail → Error" shown because the baseline grouped errors with failures but the current branch tracked them separately). 
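As a minimal illustration (the test id here is hypothetical), the analyzer intersects the baseline failing set with the current error set, so a test whose outcome never actually changed still lands in the Fail → Error cell:

    # Old target-branch extraction folded errors into failing_tests ...
    baseline = {"failing_tests": ["tests/test_io.py::test_read"], "error_tests": []}
    # ... while test-py-pytest.yml reports the same errored test separately.
    current = {"failing_tests": [], "error_tests": ["tests/test_io.py::test_read"]}

    phantom = set(baseline["failing_tests"]) & set(current["error_tests"])
    assert phantom == {"tests/test_io.py::test_read"}  # reported as Fail -> Error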
Changes: - Add error_tests list to track error outcomes separately - Split 'elif outcome in ["failed", "error"]' into separate conditions - Add error_tests to test_data.json output - Add error_count to job outputs and cache --- .github/workflows/run-branch-test.yml | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/.github/workflows/run-branch-test.yml b/.github/workflows/run-branch-test.yml index 4f06379..c91b2f7 100644 --- a/.github/workflows/run-branch-test.yml +++ b/.github/workflows/run-branch-test.yml @@ -84,6 +84,7 @@ jobs: has_errors: ${{ steps.results.outputs.has_errors }} error_type: ${{ steps.results.outputs.error_type }} failing_count: ${{ steps.results.outputs.failing_count }} + error_count: ${{ steps.results.outputs.error_count }} skipped_count: ${{ steps.results.outputs.skipped_count }} xfailed_count: ${{ steps.results.outputs.xfailed_count }} @@ -343,6 +344,7 @@ jobs: percentage = 0.0 passing_tests = [] failing_tests = [] + error_tests = [] skipped_tests = [] xfailed_tests = [] all_tests = [] @@ -367,8 +369,10 @@ jobs: all_tests.append(nodeid) if outcome == 'passed': passing_tests.append(nodeid) - elif outcome in ['failed', 'error']: + elif outcome == 'failed': failing_tests.append(nodeid) + elif outcome == 'error': + error_tests.append(nodeid) elif outcome == 'skipped': skipped_tests.append(nodeid) reason = test.get('longrepr', 'No reason') @@ -415,6 +419,7 @@ jobs: json.dump({ 'passing_tests': passing_tests, 'failing_tests': failing_tests, + 'error_tests': error_tests, 'skipped_tests': skipped_tests, 'xfailed_tests': xfailed_tests, 'all_tests': all_tests, @@ -430,6 +435,7 @@ jobs: f.write(f'passed={passed}\n') f.write(f'percentage={percentage:.2f}\n') f.write(f'failing_count={len(failing_tests)}\n') + f.write(f'error_count={len(error_tests)}\n') f.write(f'skipped_count={len(skipped_tests)}\n') f.write(f'xfailed_count={len(xfailed_tests)}\n') " @@ -460,6 +466,7 @@ jobs: has_errors=${{ steps.check-collection.outputs.has_errors || 'false' }} error_type=${{ steps.check-collection.outputs.error_type || 'none' }} failing_count=${{ steps.extract-results.outputs.failing_count || '0' }} + error_count=${{ steps.extract-results.outputs.error_count || '0' }} skipped_count=${{ steps.extract-results.outputs.skipped_count || '0' }} xfailed_count=${{ steps.extract-results.outputs.xfailed_count || '0' }} EOF @@ -501,6 +508,7 @@ jobs: echo "has_errors=${{ steps.load-cache.outputs.has_errors || 'false' }}" >> $GITHUB_OUTPUT echo "error_type=${{ steps.load-cache.outputs.error_type || 'none' }}" >> $GITHUB_OUTPUT echo "failing_count=${{ steps.load-cache.outputs.failing_count || '0' }}" >> $GITHUB_OUTPUT + echo "error_count=${{ steps.load-cache.outputs.error_count || '0' }}" >> $GITHUB_OUTPUT echo "skipped_count=${{ steps.load-cache.outputs.skipped_count || '0' }}" >> $GITHUB_OUTPUT echo "xfailed_count=${{ steps.load-cache.outputs.xfailed_count || '0' }}" >> $GITHUB_OUTPUT else @@ -513,6 +521,7 @@ jobs: echo "has_errors=${{ steps.check-collection.outputs.has_errors || 'false' }}" >> $GITHUB_OUTPUT echo "error_type=${{ steps.check-collection.outputs.error_type || 'none' }}" >> $GITHUB_OUTPUT echo "failing_count=${{ steps.extract-results.outputs.failing_count || '0' }}" >> $GITHUB_OUTPUT + echo "error_count=${{ steps.extract-results.outputs.error_count || '0' }}" >> $GITHUB_OUTPUT echo "skipped_count=${{ steps.extract-results.outputs.skipped_count || '0' }}" >> $GITHUB_OUTPUT echo "xfailed_count=${{ steps.extract-results.outputs.xfailed_count || '0' }}" >> 
$GITHUB_OUTPUT fi From 18662a0e9eb969ae3acb72c0455b8ff86e0eab55 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 21 Dec 2025 21:28:16 +0000 Subject: [PATCH 4/5] Add cache version to invalidate stale test results Old cached results may have errors grouped with failures (pre-fix). Adding a version string (v2) to the cache key ensures all PRs get fresh test results with proper error tracking. Bump CACHE_VERSION when extraction logic changes in the future. --- .github/workflows/run-branch-test.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/run-branch-test.yml b/.github/workflows/run-branch-test.yml index c91b2f7..db014d0 100644 --- a/.github/workflows/run-branch-test.yml +++ b/.github/workflows/run-branch-test.yml @@ -93,7 +93,9 @@ jobs: - name: Set cache keys id: cache-keys run: | - BASE_KEY="pytest-${{ inputs.target_branch }}-${{ github.event.pull_request.base.sha || github.sha }}" + # Version bump forces cache invalidation when extraction logic changes + CACHE_VERSION="v2" + BASE_KEY="pytest-${CACHE_VERSION}-${{ inputs.target_branch }}-${{ github.event.pull_request.base.sha || github.sha }}" echo "base_key=$BASE_KEY" >> $GITHUB_OUTPUT echo "pending_key=${BASE_KEY}-pending-${{ github.run_id }}" >> $GITHUB_OUTPUT echo "🔍 Cache base key: $BASE_KEY" From 5ec4f330f6dff1a0f9ef050c07359f31b8963ec8 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 21 Dec 2025 21:32:48 +0000 Subject: [PATCH 5/5] Add xpassed (unexpected pass) state tracking Adds tracking for pytest's 'xpassed' outcome - when a test marked with @pytest.mark.xfail unexpectedly passes. Changes: - test-py-pytest.yml: Track xpassed_tests separately - run-branch-test.yml: Track xpassed_tests in inline extraction code - regression-test.yml: Add XPass to STATES matrix (7x7 now) - XPass placed between Pass and Skip in state ordering - Pass <-> XPass transitions are neutral (marker change only) - XPass to worse states = regression - Worse states to XPass = improvement - Bump cache version to v3 to invalidate stale results --- .github/workflows/regression-test.yml | 23 +++++++++++++++++------ .github/workflows/run-branch-test.yml | 11 ++++++++++- .github/workflows/test-py-pytest.yml | 5 +++++ 3 files changed, 32 insertions(+), 7 deletions(-) diff --git a/.github/workflows/regression-test.yml b/.github/workflows/regression-test.yml index e097be9..108644b 100644 --- a/.github/workflows/regression-test.yml +++ b/.github/workflows/regression-test.yml @@ -300,8 +300,8 @@ jobs: # States ordered from best to worst for matrix display # This ordering puts improvements below diagonal, regressions above - STATES = ["Pass", "Skip", "XFail", "Fail", "Error", "Nonexistent"] - STATE_KEYS = ["passing", "skipped", "xfailed", "failing", "error", "nonexistent"] + STATES = ["Pass", "XPass", "Skip", "XFail", "Fail", "Error", "Nonexistent"] + STATE_KEYS = ["passing", "xpassed", "skipped", "xfailed", "failing", "error", "nonexistent"] def load_json(path: str) -> dict: file_path = Path(path) @@ -350,6 +350,8 @@ jobs: data["skipped"].add(str(test_id)) elif status in {"xfailed", "xfail"}: data["xfailed"].add(str(test_id)) + elif status in {"xpassed", "xpass"}: + data["xpassed"].add(str(test_id)) else: data["other"].add(str(test_id)) @@ -360,6 +362,7 @@ jobs: "error": set(coerce_list(raw.get("error_tests"))), "skipped": set(coerce_list(raw.get("skipped_tests"))), "xfailed": set(coerce_list(raw.get("xfailed_tests"))), + "xpassed": set(coerce_list(raw.get("xpassed_tests"))), "warnings": set(coerce_list(raw.get("warnings"))), 
"all": set(coerce_list(raw.get("all_tests"))), "other": set(), @@ -374,6 +377,7 @@ jobs: | data["error"] | data["skipped"] | data["xfailed"] + | data["xpassed"] | data["other"] ) @@ -393,19 +397,25 @@ jobs: # Anything to Nonexistent is regression if to_state == "Nonexistent": return "regression" - # Pass to anything else is regression - if from_state == "Pass": + # Pass/XPass to anything else (except each other) is regression + if from_state == "Pass" and to_state != "XPass": + return "regression" + if from_state == "XPass" and to_state != "Pass": return "regression" - # Skip to Fail/Error already covered above # Neutrals if from_state == "Fail" and to_state == "XFail": return "neutral" if from_state == "Skip" and to_state == "XFail": return "neutral" + # Pass <-> XPass is neutral (just adding/removing xfail marker) + if from_state == "Pass" and to_state == "XPass": + return "neutral" + if from_state == "XPass" and to_state == "Pass": + return "neutral" # Improvements (italic): moving to a better state - if to_state == "Pass": + if to_state in ["Pass", "XPass"]: return "improvement" if from_state in ["Fail", "Error"] and to_state == "Skip": return "improvement" @@ -432,6 +442,7 @@ jobs: """Get the set of tests in a given state.""" state_map = { "Pass": "passing", + "XPass": "xpassed", "Skip": "skipped", "XFail": "xfailed", "Fail": "failing", diff --git a/.github/workflows/run-branch-test.yml b/.github/workflows/run-branch-test.yml index db014d0..678e834 100644 --- a/.github/workflows/run-branch-test.yml +++ b/.github/workflows/run-branch-test.yml @@ -87,6 +87,7 @@ jobs: error_count: ${{ steps.results.outputs.error_count }} skipped_count: ${{ steps.results.outputs.skipped_count }} xfailed_count: ${{ steps.results.outputs.xfailed_count }} + xpassed_count: ${{ steps.results.outputs.xpassed_count }} steps: # Define cache keys @@ -94,7 +95,7 @@ jobs: id: cache-keys run: | # Version bump forces cache invalidation when extraction logic changes - CACHE_VERSION="v2" + CACHE_VERSION="v3" BASE_KEY="pytest-${CACHE_VERSION}-${{ inputs.target_branch }}-${{ github.event.pull_request.base.sha || github.sha }}" echo "base_key=$BASE_KEY" >> $GITHUB_OUTPUT echo "pending_key=${BASE_KEY}-pending-${{ github.run_id }}" >> $GITHUB_OUTPUT @@ -349,6 +350,7 @@ jobs: error_tests = [] skipped_tests = [] xfailed_tests = [] + xpassed_tests = [] all_tests = [] skipped_with_reasons = {} xfailed_with_reasons = {} @@ -387,6 +389,8 @@ jobs: if isinstance(reason, list): reason = reason[0] if reason else 'No reason' xfailed_with_reasons[nodeid] = str(reason).strip() + elif outcome == 'xpassed': + xpassed_tests.append(nodeid) percentage = (passed / total * 100) if total > 0 else 0 except FileNotFoundError: @@ -424,6 +428,7 @@ jobs: 'error_tests': error_tests, 'skipped_tests': skipped_tests, 'xfailed_tests': xfailed_tests, + 'xpassed_tests': xpassed_tests, 'all_tests': all_tests, 'skipped_tests_with_reasons': skipped_with_reasons, 'xfailed_tests_with_reasons': xfailed_with_reasons, @@ -440,6 +445,7 @@ jobs: f.write(f'error_count={len(error_tests)}\n') f.write(f'skipped_count={len(skipped_tests)}\n') f.write(f'xfailed_count={len(xfailed_tests)}\n') + f.write(f'xpassed_count={len(xpassed_tests)}\n') " - name: Save results to cache @@ -471,6 +477,7 @@ jobs: error_count=${{ steps.extract-results.outputs.error_count || '0' }} skipped_count=${{ steps.extract-results.outputs.skipped_count || '0' }} xfailed_count=${{ steps.extract-results.outputs.xfailed_count || '0' }} + xpassed_count=${{ steps.extract-results.outputs.xpassed_count || 
'0' }} EOF # Remove leading whitespace from the env file sed -i 's/^[[:space:]]*//' cached_target/outputs.env @@ -513,6 +520,7 @@ jobs: echo "error_count=${{ steps.load-cache.outputs.error_count || '0' }}" >> $GITHUB_OUTPUT echo "skipped_count=${{ steps.load-cache.outputs.skipped_count || '0' }}" >> $GITHUB_OUTPUT echo "xfailed_count=${{ steps.load-cache.outputs.xfailed_count || '0' }}" >> $GITHUB_OUTPUT + echo "xpassed_count=${{ steps.load-cache.outputs.xpassed_count || '0' }}" >> $GITHUB_OUTPUT else echo "📋 Using fresh results" echo "total=${{ steps.extract-results.outputs.total || '0' }}" >> $GITHUB_OUTPUT @@ -526,6 +534,7 @@ jobs: echo "error_count=${{ steps.extract-results.outputs.error_count || '0' }}" >> $GITHUB_OUTPUT echo "skipped_count=${{ steps.extract-results.outputs.skipped_count || '0' }}" >> $GITHUB_OUTPUT echo "xfailed_count=${{ steps.extract-results.outputs.xfailed_count || '0' }}" >> $GITHUB_OUTPUT + echo "xpassed_count=${{ steps.extract-results.outputs.xpassed_count || '0' }}" >> $GITHUB_OUTPUT fi # Compare results diff --git a/.github/workflows/test-py-pytest.yml b/.github/workflows/test-py-pytest.yml index 3bf53cf..22d65a4 100644 --- a/.github/workflows/test-py-pytest.yml +++ b/.github/workflows/test-py-pytest.yml @@ -192,6 +192,7 @@ jobs: error_tests = [] skipped_tests = [] xfailed_tests = [] + xpassed_tests = [] all_tests = [] skipped_with_reasons = {} xfailed_with_reasons = {} @@ -230,6 +231,8 @@ jobs: if isinstance(reason, list): reason = reason[0] if reason else 'No reason' xfailed_with_reasons[nodeid] = str(reason).strip() + elif outcome == 'xpassed': + xpassed_tests.append(nodeid) percentage = (passed / total * 100) if total > 0 else 0 except FileNotFoundError: @@ -267,6 +270,7 @@ jobs: 'error_tests': error_tests, 'skipped_tests': skipped_tests, 'xfailed_tests': xfailed_tests, + 'xpassed_tests': xpassed_tests, 'all_tests': all_tests, 'skipped_tests_with_reasons': skipped_with_reasons, 'xfailed_tests_with_reasons': xfailed_with_reasons, @@ -283,6 +287,7 @@ jobs: f.write(f'error_count={len(error_tests)}\n') f.write(f'skipped_count={len(skipped_tests)}\n') f.write(f'xfailed_count={len(xfailed_tests)}\n') + f.write(f'xpassed_count={len(xpassed_tests)}\n') " - name: Upload test artifacts
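For reference, a rough sketch of the per-run results JSON that the test workflows write and the regression analyzer consumes after this series. The key names are taken from the diffs above; the test ids are hypothetical, and the reason maps and count fields the workflows also emit are left out for brevity:

    import json

    sample_results = {
        "passing_tests": ["tests/test_core.py::test_add"],
        "failing_tests": ["tests/test_core.py::test_div"],
        "error_tests": ["tests/test_io.py::test_open"],        # split out from failing_tests in this series
        "skipped_tests": [],
        "xfailed_tests": ["tests/test_core.py::test_legacy"],
        "xpassed_tests": ["tests/test_core.py::test_flaky"],   # added in PATCH 5/5
        "all_tests": [
            "tests/test_core.py::test_add",
            "tests/test_core.py::test_div",
            "tests/test_io.py::test_open",
            "tests/test_core.py::test_legacy",
            "tests/test_core.py::test_flaky",
        ],
        "warnings": [],                                        # optional; compared for new discovery warnings
    }

    with open("current_results.json", "w") as fh:
        json.dump(sample_results, fh, indent=2)

Keys that are absent simply default to empty sets in build_status_sets (it reads them with .get), so the cache-version bumps in PATCH 4/5 and 5/5 are mainly about keeping the baseline and current runs on the same schema rather than about avoiding crashes.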