From a1da8b3e331a03ae6bb71b7ca98ba808e3f323c8 Mon Sep 17 00:00:00 2001
From: Daniel Ibarrola Sanchez <dibarrola@griddynamics.com>
Date: Mon, 29 Dec 2025 15:36:29 +0000
Subject: [PATCH] Add retry mechanism for workflow runs

---
 culprit_finder/README.md                      |  1 +
 culprit_finder/src/culprit_finder/cli.py      |  9 ++++
 .../src/culprit_finder/culprit_finder.py      | 31 ++++++++++---
 culprit_finder/tests/test_cli.py              |  2 +
 culprit_finder/tests/test_culprit_finder.py   | 46 +++++++++++++++++--
 5 files changed, 78 insertions(+), 11 deletions(-)

diff --git a/culprit_finder/README.md b/culprit_finder/README.md
index 8edea27..5aae23a 100644
--- a/culprit_finder/README.md
+++ b/culprit_finder/README.md
@@ -57,6 +57,7 @@ culprit-finder --repo <OWNER/REPO> --start <GOOD_SHA> --end <BAD_SHA> --workflow
 - `--start`: The full or short SHA of the last known **good** commit.
 - `--end`: The full or short SHA of the first known **bad** commit.
 - `--workflow`: The filename of the GitHub Actions workflow to run (e.g., `ci.yml`, `tests.yaml`).
+- `--retry`: (Optional) Number of times to retry the workflow run if it fails (default: 0).
 - `--clear-cache`: (Optional) Deletes the local state file before execution to start a fresh bisection.
 
 ### State Persistence and Resuming
diff --git a/culprit_finder/src/culprit_finder/cli.py b/culprit_finder/src/culprit_finder/cli.py
index 37d7668..75b9266 100644
--- a/culprit_finder/src/culprit_finder/cli.py
+++ b/culprit_finder/src/culprit_finder/cli.py
@@ -55,6 +55,13 @@ def main() -> None:
     action="store_true",
     help="Deletes the local state file before execution",
   )
+  parser.add_argument(
+    "--retry",
+    required=False,
+    help="Number of times to retry the workflow run if it fails (default: 0).",
+    default=0,
+    type=int,
+  )
 
   args = parser.parse_args()
 
@@ -107,6 +114,7 @@ def main() -> None:
   logging.info("Start commit: %s", args.start)
   logging.info("End commit: %s", args.end)
   logging.info("Workflow: %s", args.workflow)
+  logging.info("Retries: %s", args.retry)
 
   has_culprit_finder_workflow = any(
     wf["path"] == ".github/workflows/culprit_finder.yml"
@@ -124,6 +132,7 @@ def main() -> None:
     github_client=gh_client,
     state=state,
     state_persister=state_persister,
+    retries=args.retry,
   )
 
   try:
diff --git a/culprit_finder/src/culprit_finder/culprit_finder.py b/culprit_finder/src/culprit_finder/culprit_finder.py
index ce29408..d98dc66 100644
--- a/culprit_finder/src/culprit_finder/culprit_finder.py
+++ b/culprit_finder/src/culprit_finder/culprit_finder.py
@@ -27,6 +27,7 @@ def __init__(
     github_client: github.GithubClient,
     state: culprit_finder_state.CulpritFinderState,
     state_persister: culprit_finder_state.StatePersister,
+    retries: int = 0,
   ):
     """
     Initializes the CulpritFinder instance.
@@ -40,6 +41,7 @@ def __init__(
         github_client: The GithubClient instance used to interact with GitHub.
         state: The CulpritFinderState object containing the current bisection state.
         state_persister: The StatePersister object used to save the bisection state.
+        retries: Number of times to retry workflow runs in case of failure.
     """
     self._repo = repo
     self._start_sha = start_sha
@@ -50,6 +52,7 @@ def __init__(
     self._gh_client = github_client
     self._state = state
     self._state_persister = state_persister
+    self._retries = retries
 
   def _wait_for_workflow_completion(
     self,
@@ -145,14 +148,28 @@ def _test_commit(
       inputs,
     )
 
-    run = self._wait_for_workflow_completion(
-      workflow_to_trigger,
-      branch_name,
-      commit_sha,
-      previous_run_id,
-    )
+    run: github.Run | None = None
+    for attempt in range(self._retries + 1):
+      run = self._wait_for_workflow_completion(
+        workflow_to_trigger,
+        branch_name,
+        commit_sha,
+        previous_run_id,
+      )
+
+      if run and run["conclusion"] == "success":
+        return True
+
+      if attempt < self._retries:
+        logging.info(
+          "Retrying workflow for commit %s (attempt %d/%d)",
+          commit_sha,
+          attempt + 1,
+          self._retries,
+        )
+
     if not run:
-      logging.error("Workflow failed to complete")
+      logging.error("Workflow failed to complete for commit %s", commit_sha)
       return False
 
     return run["conclusion"] == "success"
diff --git a/culprit_finder/tests/test_cli.py b/culprit_finder/tests/test_cli.py
index 23e5c09..b278890 100644
--- a/culprit_finder/tests/test_cli.py
+++ b/culprit_finder/tests/test_cli.py
@@ -235,6 +235,7 @@ def test_cli_success(
     workflow_file="test.yml",
     has_culprit_finder_workflow=has_culprit_workflow,
     github_client=mock_gh_client_instance,
+    retries=0,
     state=expected_state,
     state_persister=patches["state_persister_inst"],
   )
@@ -310,6 +311,7 @@ def test_cli_state_management(
       state=existing_state,
       github_client=mock_gh_client_instance,
       state_persister=patches["state_persister_inst"],
+      retries=0,
     )
   else:
     # If not exists or discarded, new state created
diff --git a/culprit_finder/tests/test_culprit_finder.py b/culprit_finder/tests/test_culprit_finder.py
index f2a7fd7..ae9f944 100644
--- a/culprit_finder/tests/test_culprit_finder.py
+++ b/culprit_finder/tests/test_culprit_finder.py
@@ -24,9 +24,8 @@ def mock_state_persister(mocker):
 
 
 @pytest.fixture
-def finder(request, mock_gh_client, mock_state_persister):
-  """Returns a CulpritFinder instance for testing."""
-  state: culprit_finder_state.CulpritFinderState = {
+def mock_state() -> culprit_finder_state.CulpritFinderState:
+  return {
     "repo": "test_repo",
     "workflow": "test_workflow",
     "original_start": "original_start_sha",
@@ -35,6 +34,11 @@ def finder(request, mock_gh_client, mock_state_persister):
     "current_bad": "",
     "cache": {},
   }
+
+
+@pytest.fixture
+def finder(request, mock_gh_client, mock_state_persister, mock_state):
+  """Returns a CulpritFinder instance for testing."""
   has_culprit_finder_workflow = getattr(request, "param", True)
   return culprit_finder.CulpritFinder(
     repo=REPO,
@@ -43,7 +47,7 @@ def finder(request, mock_gh_client, mock_state_persister):
     workflow_file=WORKFLOW_FILE,
     has_culprit_finder_workflow=has_culprit_finder_workflow,
     github_client=mock_gh_client,
-    state=state,
+    state=mock_state,
     state_persister=mock_state_persister,
   )
 
@@ -91,6 +95,40 @@ def test_wait_for_workflow_completion_success(mocker, finder, mock_gh_client):
     assert call_args[0][0] == workflow
 
 
+def test_test_commit_with_retries(
+  mocker, mock_gh_client, mock_state, mock_state_persister
+):
+  """Tests that _test_commit retries the specified number of times on failure."""
+  mocker.patch("culprit_finder.culprit_finder.github")
+
+  finder = culprit_finder.CulpritFinder(
+    repo=REPO,
+    start_sha="start_sha",
+    end_sha="end_sha",
+    workflow_file=WORKFLOW_FILE,
+    has_culprit_finder_workflow=True,
+    github_client=mock_gh_client,
+    state=mock_state,
+    state_persister=mock_state_persister,
+    retries=2,
+  )
+
+  branch = "test-branch"
+  commit_sha = "sha1"
+
+  mock_wait = mocker.patch.object(finder, "_wait_for_workflow_completion")
+  mock_wait.side_effect = [
+    {"conclusion": "failure"},  # Initial attempt
+    {"conclusion": "failure"},  # First retry
+    {"conclusion": "success"},  # Second retry
+  ]
+
+  is_good = finder._test_commit(commit_sha, branch)
+
+  assert is_good is True
+  assert mock_wait.call_count == 3
+
+
 @pytest.mark.parametrize("finder", [True, False], indirect=True)
 def test_test_commit_success(mocker, finder, mock_gh_client):
   """Tests that _test_commit triggers the workflow and returns True on success."""