From f2a09a3473273f3f129735a862d7c922131192a3 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Watenberg Date: Mon, 12 Jan 2026 09:36:16 +0100 Subject: [PATCH 1/2] Add babysit feature for automatic retries of failed GitHub Actions - Introduced `max_babysit_retries` setting in `settings.py` to configure the maximum number of retries (default is 5). - Added `babysit` option in command setup to enable automatic retries for failed builds. - Implemented new exceptions: `BabysitRetry`, `BabysitExhausted`, and `BabysitCancelled` in `exceptions.py` to handle different states of the babysit process. - Enhanced `Repository` class with a method to rerun failed workflow jobs. - Updated documentation to include details on the babysit feature and its usage. - Implemented logic to check and handle babysit retries in queue processing. This feature aims to reduce manual intervention for transient build failures in GitHub Actions. --- bert_e/docs/USER_DOC.md | 94 +++ bert_e/exceptions.py | 22 + bert_e/git_host/github/__init__.py | 38 ++ bert_e/git_host/github/schema.py | 3 + bert_e/settings.py | 3 + bert_e/templates/babysit_cancelled.md | 14 + bert_e/templates/babysit_exhausted.md | 23 + bert_e/templates/babysit_retry.md | 20 + bert_e/tests/unit/test_babysit.py | 785 +++++++++++++++++++++++ bert_e/workflow/gitwaterflow/__init__.py | 10 + bert_e/workflow/gitwaterflow/babysit.py | 290 +++++++++ bert_e/workflow/gitwaterflow/commands.py | 5 + bert_e/workflow/gitwaterflow/queueing.py | 137 +++- 13 files changed, 1443 insertions(+), 1 deletion(-) create mode 100644 bert_e/templates/babysit_cancelled.md create mode 100644 bert_e/templates/babysit_exhausted.md create mode 100644 bert_e/templates/babysit_retry.md create mode 100644 bert_e/tests/unit/test_babysit.py create mode 100644 bert_e/workflow/gitwaterflow/babysit.py diff --git a/bert_e/docs/USER_DOC.md b/bert_e/docs/USER_DOC.md index 54635034..4fd0a352 100644 --- a/bert_e/docs/USER_DOC.md +++ b/bert_e/docs/USER_DOC.md @@ -112,6 +112,7 @@ __Bert-E__. | options name | description | requires admin rights? | requires pull request author? | |:------------------------- |:------------------------ |:----------------------:|:-----------------------------:| | after_pull_request | Wait for the given pull request id to be merged before continuing with the current one. May be used like this: @bert-e after_pull_request=< pr_id_1 > ... | no | no +| babysit | Automatically retry failed GitHub Actions builds (see [Babysit](#babysit) section for details) | no | no | bypass_author_approval | Bypass the pull request author's approval | yes | no | bypass_build_status | Bypass the build and test status| yes | no | bypass_incompatible_branch | Bypass the check on the source branch prefix | yes | no @@ -482,6 +483,9 @@ to progress to the next step. message code | 122 | Unknown command | One of the participants asked __Bert-E__ to activate an option, or execute a command he doesn't know. Edit the corresponding message if it contains a typo. Delete it otherwise | 123 | Not authorized | One of the participants asked __Bert-E__ to activate a privileged option, or execute a privileged command, but doesn't have enough credentials to do so. Delete the corresponding command ask a __Bert-E__ administrator to run/set the desired command/option. | 134 | Not author | One of the participants asked __Bert-E__ to activate an authored option, but the participant is not the author of the pull request. 
+| 140 | Babysit: Retrying build | __Bert-E__ is automatically retrying failed GitHub Actions jobs because the babysit option is enabled. No action required - wait for the new build to complete. +| 141 | Babysit: Maximum retries reached | __Bert-E__ has exhausted all automatic retry attempts. Investigate the build failure. To get more retries, comment `@bert-e babysit` again. +| 142 | Babysit: Cancelled | __Bert-E__ cancelled the babysit option because new commits were pushed. To re-enable automatic retries, comment `@bert-e babysit` again. Queues ------ @@ -562,6 +566,96 @@ All those states can be found on Bert-E's UI. > Note: Bert-E will not notify the user if a build fails inside the queue. +Babysit +------- + +__The babysit option enables automatic retry of failed GitHub Actions builds.__ + +When working with GitHub Actions, builds can sometimes fail due to flaky tests, +transient infrastructure issues, or other temporary problems. The `babysit` +option allows __Bert-E__ to automatically retry failed workflow runs, reducing +the need for manual intervention. + +### Enabling Babysit + +To enable babysit on a pull request, comment: + + @bert-e babysit + +### How It Works + +When babysit is enabled and a build fails: + +1. __Bert-E__ detects the failed GitHub Actions workflow runs +2. For each failed workflow, __Bert-E__ triggers GitHub's "Re-run failed jobs" +3. __Bert-E__ posts a comment indicating the retry attempt +4. This process repeats until the build succeeds or the maximum retry limit + is reached + +### Scope of Babysit + +The babysit behavior applies to: + +* **Integration branches** (`w/x.y/...`): Failed builds on integration branches + are automatically retried +* **Queue branches** (`q/...`): Failed builds in the merge queue are also + retried if babysit was enabled on the corresponding pull request +* **All workflow runs individually**: Each GitHub Actions workflow is tracked + and retried independently. If you have multiple workflows (e.g., CI, Tests, + Lint), each one has its own retry counter. This means: + - If CI fails 5 times but Tests only fails twice, CI is exhausted while + Tests can still be retried 3 more times + - Only workflows that haven't reached their retry limit are retried + - __Bert-E__ shows a table with each workflow's retry count in the comments + +### Maximum Retries + +By default, __Bert-E__ will retry failed builds up to **5 times**. After the +maximum number of retries is reached, __Bert-E__ posts a `BabysitExhausted` +message indicating that automatic retries have been exhausted. + +This limit can be configured per repository by setting the +`max_babysit_retries` parameter in the repository's __Bert-E__ configuration: + +```yaml +max_babysit_retries: 10 # Allow up to 10 retries instead of the default 5 +``` + +### Re-enabling Babysit After Exhaustion + +If the maximum retries have been exhausted but you want to continue with +automatic retries, simply comment `@bert-e babysit` again. This resets the +retry counter and allows for another round of automatic retries. + +### Babysit Cancellation on New Commits + +**Important:** If you push new commits to your branch after enabling babysit, +the babysit option is automatically cancelled. This prevents stale retry +attempts from continuing on outdated code. + +When this happens, __Bert-E__ will post a `BabysitCancelled` message explaining +that new commits were detected. To re-enable automatic retries for the new +commits, you must comment `@bert-e babysit` again. + +> **Example workflow:** +> +> 1. 
You comment `@bert-e babysit` +> 2. Build fails, __Bert-E__ retries (attempt 1/5) +> 3. Build fails again, __Bert-E__ retries (attempt 2/5) +> 4. You push a fix to address the build failure +> 5. __Bert-E__ detects the new commit and cancels babysit +> 6. Build fails on the new commit +> 7. You comment `@bert-e babysit` again to enable retries for the new code +> 8. __Bert-E__ retries (attempt 1/5 - counter is reset) + +### Limitations + +* Babysit only works with **GitHub Actions** (`build_key: github_actions`) +* Babysit is not available for other CI systems (Bitbucket Pipelines, Jenkins, + etc.) +* Babysit does not bypass build failures - if the issue is not transient, the + build will continue to fail after all retries are exhausted + Going further with __Bert-E__ ----------------------------- Do you like __Bert-E__? Would like to use it on your own projects? diff --git a/bert_e/exceptions.py b/bert_e/exceptions.py index fb93bdcf..6fbb1e14 100644 --- a/bert_e/exceptions.py +++ b/bert_e/exceptions.py @@ -585,3 +585,25 @@ class JobFailure(SilentException): class QueueBuildFailed(SilentException): code = 309 + + +class BabysitRetry(TemplateException): + """Raised when babysit mode triggers a retry of failed GitHub Actions.""" + code = 140 + template = 'babysit_retry.md' + dont_repeat_if_in_history = 0 # allow repeating for each retry + status = "in_progress" + + +class BabysitExhausted(TemplateException): + """Raised when babysit mode has exhausted all retry attempts.""" + code = 141 + template = 'babysit_exhausted.md' + status = "failure" + + +class BabysitCancelled(TemplateException): + """Raised when babysit mode is cancelled due to new commits.""" + code = 142 + template = 'babysit_cancelled.md' + status = "in_progress" diff --git a/bert_e/git_host/github/__init__.py b/bert_e/git_host/github/__init__.py index c6daf27b..767cb5b8 100644 --- a/bert_e/git_host/github/__init__.py +++ b/bert_e/git_host/github/__init__.py @@ -518,6 +518,16 @@ def create_pull_request(self, title, src_branch, dst_branch, description, return PullRequest.create(self.client, data=kwargs, owner=self.owner, repo=self.slug) + def rerun_failed_workflow_jobs(self, run_id: int) -> None: + """Re-run only the failed jobs of a workflow run. + + Args: + run_id: The ID of the workflow run to re-run failed jobs for. + + """ + url = f'/repos/{self.owner}/{self.slug}/actions/runs/{run_id}/rerun-failed-jobs' + self.client.post(url, data='{}') + class AggregatedStatus(base.AbstractGitHostObject): GET_URL = '/repos/{owner}/{repo}/commits/{ref}/status' @@ -640,6 +650,34 @@ def branch(self) -> str | None: return self._workflow_runs[0]['head_branch'] return None + def get_failed_runs(self): + """Get workflow runs that have failed. + + This method filters workflow runs to keep only the most relevant run + per workflow (same logic as remove_unwanted_workflows), then returns + those that have failed. + + Returns: + List of dicts with 'id' and 'run_attempt' for each failed run. 
+ """ + # First, filter to get the best run per workflow (same as state check) + self.remove_unwanted_workflows() + + failed_runs = [] + for run in self._workflow_runs: + if run.get('conclusion') == 'failure': + failed_runs.append({ + 'id': run['id'], + 'run_attempt': run.get('run_attempt', 1), + 'workflow_id': run.get('workflow_id'), + 'name': run.get('name', 'unknown'), + 'html_url': run.get('html_url', ''), + }) + LOG.debug( + "Babysit: found failed run id=%d, run_attempt=%d, name=%s", + run['id'], run.get('run_attempt', 1), run.get('name', '')) + return failed_runs + def remove_unwanted_workflows(self): """ Remove two things: diff --git a/bert_e/git_host/github/schema.py b/bert_e/git_host/github/schema.py index 60370202..42a8bfa5 100644 --- a/bert_e/git_host/github/schema.py +++ b/bert_e/git_host/github/schema.py @@ -143,6 +143,9 @@ class WorkflowRun(GitHubSchema): event = fields.Str() repository = fields.Nested(Repo) workflow_id = fields.Integer() + # run_attempt indicates the number of times this workflow has been run + # Defaults to 1 for first run, increments with each rerun + run_attempt = fields.Integer(load_default=1) class AggregateWorkflowRuns(GitHubSchema): diff --git a/bert_e/settings.py b/bert_e/settings.py index d00cf318..2140553a 100644 --- a/bert_e/settings.py +++ b/bert_e/settings.py @@ -195,6 +195,9 @@ class Meta: send_bot_status = fields.Bool(required=False, load_default=False) + # Babysit feature: automatic retry of failed GitHub Actions + max_babysit_retries = fields.Int(required=False, load_default=5) + @pre_load(pass_many=True) def load_env(self, data, **kwargs): """Load environment variables""" diff --git a/bert_e/templates/babysit_cancelled.md b/bert_e/templates/babysit_cancelled.md new file mode 100644 index 00000000..c4a39f46 --- /dev/null +++ b/bert_e/templates/babysit_cancelled.md @@ -0,0 +1,14 @@ +{% extends "message.md" %} + +{% block title -%} +Babysit: Cancelled +{% endblock %} + +{% block message %} +**Babysit mode has been cancelled** because new commits were pushed to the branch. + +Previous retries were for commit `{{ previous_commit[:7] }}`, but the current commit is `{{ current_commit[:7] }}`. + +If you want to enable automatic retries for the new commits, please comment `@{{ robot }} babysit` again. +{% endblock %} + diff --git a/bert_e/templates/babysit_exhausted.md b/bert_e/templates/babysit_exhausted.md new file mode 100644 index 00000000..dd7eaed8 --- /dev/null +++ b/bert_e/templates/babysit_exhausted.md @@ -0,0 +1,23 @@ +{% extends "message.md" %} + +{% block title -%} +Babysit: Maximum retries reached +{% endblock %} + +{% block message %} +The {% if build_url -%}[build]({{ build_url }}) {% else -%}build {% endif -%} +has exhausted all automatic retry attempts on branch `{{ branch.name }}`. 
+
+**Exhausted workflows** ({{ max_retries }} retries each):
+{% for wf in exhausted_workflows -%}
+- `{{ wf }}`
+{% endfor %}
+To investigate:
+- Review the [build logs]({{ build_url }}) for the failure cause
+- Check if this is a flaky test or a genuine issue
+
+To get more retries:
+- Fix the issue, push new commits, and comment `@{{ robot }} babysit` again (pushing new commits cancels babysit), or
+- Comment `@{{ robot }} babysit` again to reset the retry counter
+{% endblock %}
+
diff --git a/bert_e/templates/babysit_retry.md b/bert_e/templates/babysit_retry.md
new file mode 100644
index 00000000..73eaaa82
--- /dev/null
+++ b/bert_e/templates/babysit_retry.md
@@ -0,0 +1,20 @@
+{% extends "message.md" %}
+
+{% block title -%}
+Babysit: Retrying build
+{% endblock %}
+
+{% block message %}
+The {% if build_url -%}[build]({{ build_url }}) {% else -%}build {% endif -%}
+failed on branch `{{ branch.name }}` (commit `{{ commit_sha[:7] }}`).
+
+**Babysit mode is active** - automatically retrying failed workflows:
+
+| Workflow | Retry |
+|:---------|:-----:|
+{% for wf in workflows -%}
+| `{{ wf.name }}` | {{ wf.retry_count }}/{{ max_retries }} |
+{% endfor %}
+Please wait for the new build to complete.
+{% endblock %}
+
diff --git a/bert_e/tests/unit/test_babysit.py b/bert_e/tests/unit/test_babysit.py
new file mode 100644
index 00000000..038e5af0
--- /dev/null
+++ b/bert_e/tests/unit/test_babysit.py
@@ -0,0 +1,785 @@
+# Copyright 2016-2018 Scality
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tests for the babysit feature.""" + +from types import SimpleNamespace +from unittest.mock import MagicMock, patch + +import pytest + +from bert_e.exceptions import BabysitRetry, BabysitExhausted, BabysitCancelled +from bert_e.git_host.github import AggregatedWorkflowRuns, Client +from bert_e.workflow.gitwaterflow.babysit import ( + count_babysit_retries_per_workflow, handle_babysit_retry, + BABYSIT_RETRY_MARKER, COMMIT_SHA_PATTERN, WORKFLOW_RETRY_PATTERN +) +from bert_e.workflow.gitwaterflow.queueing import ( + _check_pr_babysit_enabled, _handle_queue_babysit_retry +) + + +@pytest.fixture +def client(): + return Client( + login='login', + password='password', + email='email@org.com', + base_url="http://localhost:4010", + accept_header="application/json" + ) + + +@pytest.fixture +def failed_workflow_run_json(): + """Workflow run JSON with a failed run on an integration branch (w/).""" + return { + 'workflow_runs': [ + { + 'id': 12345, + 'head_sha': 'd6fde92930d4715a2b49857d24b940956b26d2d3', + 'head_branch': 'w/5.0/feature/test', + 'status': 'completed', + 'event': 'pull_request', + 'workflow_id': 1, + 'check_suite_id': 1, + 'conclusion': 'failure', + 'run_attempt': 2, + 'name': 'CI Build', + 'html_url': 'https://github.com/octo-org/Hello-World/actions/runs/12345', + 'repository': { + 'full_name': 'octo-org/Hello-World', + 'owner': {'login': 'octo-org'}, + 'name': 'Hello-World' + } + }, + ], + 'total_count': 1 + } + + +@pytest.fixture +def successful_workflow_run_json(): + """Workflow run JSON with a successful run.""" + return { + 'workflow_runs': [ + { + 'id': 12345, + 'head_sha': 'd6fde92930d4715a2b49857d24b940956b26d2d3', + 'head_branch': 'w/5.0/feature/test', + 'status': 'completed', + 'event': 'pull_request', + 'workflow_id': 1, + 'check_suite_id': 1, + 'conclusion': 'success', + 'run_attempt': 1, + 'name': 'CI Build', + 'repository': { + 'full_name': 'octo-org/Hello-World', + 'owner': {'login': 'octo-org'}, + 'name': 'Hello-World' + } + }, + ], + 'total_count': 1 + } + + +class TestGetFailedRuns: + """Tests for AggregatedWorkflowRuns.get_failed_runs().""" + + def test_get_failed_runs_returns_failed_workflows( + self, client, failed_workflow_run_json): + """Test that get_failed_runs returns failed workflow runs.""" + workflow_runs = AggregatedWorkflowRuns( + client, **failed_workflow_run_json) + + failed_runs = workflow_runs.get_failed_runs() + + assert len(failed_runs) == 1 + assert failed_runs[0]['id'] == 12345 + assert failed_runs[0]['run_attempt'] == 2 + assert failed_runs[0]['workflow_id'] == 1 + assert failed_runs[0]['name'] == 'CI Build' + + def test_get_failed_runs_returns_empty_for_successful( + self, client, successful_workflow_run_json): + """Test that get_failed_runs returns empty for successful runs.""" + workflow_runs = AggregatedWorkflowRuns( + client, **successful_workflow_run_json) + + failed_runs = workflow_runs.get_failed_runs() + + assert len(failed_runs) == 0 + + def test_get_failed_runs_default_run_attempt(self, client): + """Test that run_attempt defaults to 1 if not present.""" + workflow_run_json = { + 'workflow_runs': [ + { + 'id': 99999, + 'head_sha': 'abc123', + 'head_branch': 'feature', + 'status': 'completed', + 'event': 'pull_request', + 'workflow_id': 1, + 'check_suite_id': 1, + 'conclusion': 'failure', + # run_attempt not present + 'repository': { + 'full_name': 'org/repo', + 'owner': {'login': 'org'}, + 'name': 'repo' + } + }, + ], + 'total_count': 1 + } + workflow_runs = AggregatedWorkflowRuns(client, **workflow_run_json) + + failed_runs = 
workflow_runs.get_failed_runs() + + assert len(failed_runs) == 1 + assert failed_runs[0]['run_attempt'] == 1 # default + + +class TestPatterns: + """Tests for regex patterns.""" + + def test_commit_sha_pattern_matches_short_sha(self): + """Test matching a short commit SHA (7 chars).""" + text = 'branch `w/5.0/test` (commit `abc1234`)' + match = COMMIT_SHA_PATTERN.search(text) + assert match is not None + assert match.group(1) == 'abc1234' + + def test_commit_sha_pattern_matches_full_sha(self): + """Test matching a full commit SHA (40 chars).""" + full_sha = 'd6fde92930d4715a2b49857d24b940956b26d2d3' + text = f'(commit `{full_sha}`)' + match = COMMIT_SHA_PATTERN.search(text) + assert match is not None + assert match.group(1) == full_sha + + def test_commit_sha_pattern_no_match_without_backticks(self): + """Test that commit without backticks doesn't match.""" + text = '(commit abc1234)' + match = COMMIT_SHA_PATTERN.search(text) + assert match is None + + def test_workflow_retry_pattern_matches(self): + """Test that workflow retry pattern matches table rows.""" + text = '| `CI Build` | 2/5 |' + match = WORKFLOW_RETRY_PATTERN.search(text) + assert match is not None + assert match.group(1) == 'CI Build' + assert match.group(2) == '2' + assert match.group(3) == '5' + + def test_workflow_retry_pattern_matches_multiple(self): + """Test extracting multiple workflows from a table.""" + text = '''| Workflow | Retry | +|:---------|:-----:| +| `CI Build` | 1/5 | +| `Tests` | 3/5 | +| `Lint` | 2/5 | +''' + matches = list(WORKFLOW_RETRY_PATTERN.finditer(text)) + assert len(matches) == 3 + assert matches[0].group(1) == 'CI Build' + assert matches[1].group(1) == 'Tests' + assert matches[2].group(1) == 'Lint' + + +class TestCountBabysitRetriesPerWorkflow: + """Tests for count_babysit_retries_per_workflow function.""" + + def _make_comment(self, author, text): + """Create a mock comment.""" + comment = MagicMock() + comment.author = author + comment.text = text + return comment + + def _make_pr(self, comments): + """Create a mock PR with comments.""" + pr = MagicMock() + pr.comments = comments + return pr + + def _make_retry_comment(self, branch_name, commit, workflows): + """Create a BabysitRetry-like comment text.""" + lines = [ + BABYSIT_RETRY_MARKER, + f'failed on branch `{branch_name}` (commit `{commit[:7]}`)', + '| Workflow | Retry |', + '|:---------|:-----:|', + ] + for wf_name, retry_count, max_retries in workflows: + lines.append(f'| `{wf_name}` | {retry_count}/{max_retries} |') + return '\n'.join(lines) + + def test_no_comments_returns_empty(self): + """Test counting with no comments returns empty dict.""" + pr = self._make_pr([]) + retries, is_stale, prev = count_babysit_retries_per_workflow( + pr, 'bert-e', 'w/5.0/feature/test', 'abc1234567890') + assert retries == {} + assert is_stale is False + assert prev is None + + def test_counts_retries_per_workflow(self): + """Test counting retries per workflow from comments.""" + branch_name = 'w/5.0/feature/test' + commit = 'abc1234567890' + comments = [ + self._make_comment('user', '@bert-e babysit'), + self._make_comment('bert-e', self._make_retry_comment( + branch_name, commit, + [('CI Build', 1, 5), ('Tests', 1, 5)] + )), + self._make_comment('bert-e', self._make_retry_comment( + branch_name, commit, + [('CI Build', 2, 5)] # Only CI failed this time + )), + ] + pr = self._make_pr(comments) + retries, is_stale, prev = count_babysit_retries_per_workflow( + pr, 'bert-e', branch_name, commit) + + # CI Build was retried twice, Tests once + assert retries 
== {'CI Build': 2, 'Tests': 1} + assert is_stale is False + + def test_babysit_command_resets_all_counts(self): + """Test that a new /babysit command resets all workflow counts.""" + branch_name = 'w/5.0/feature/test' + commit = 'abc1234567890' + comments = [ + self._make_comment('user', '@bert-e babysit'), + self._make_comment('bert-e', self._make_retry_comment( + branch_name, commit, + [('CI Build', 1, 5), ('Tests', 1, 5)] + )), + self._make_comment('bert-e', self._make_retry_comment( + branch_name, commit, + [('CI Build', 2, 5), ('Tests', 2, 5)] + )), + # User re-invokes babysit + self._make_comment('user', '@bert-e babysit'), + self._make_comment('bert-e', self._make_retry_comment( + branch_name, commit, + [('CI Build', 1, 5)] + )), + ] + pr = self._make_pr(comments) + retries, is_stale, prev = count_babysit_retries_per_workflow( + pr, 'bert-e', branch_name, commit) + + # Only 1 retry for CI Build since the reset + assert retries == {'CI Build': 1} + assert is_stale is False + + def test_detects_stale_babysit(self): + """Test detection of stale babysit when commit changed.""" + branch_name = 'w/5.0/feature/test' + old_commit = 'abc1234567890' + new_commit = 'def9876543210' + comments = [ + self._make_comment('user', '@bert-e babysit'), + self._make_comment('bert-e', self._make_retry_comment( + branch_name, old_commit, + [('CI Build', 1, 5)] + )), + ] + pr = self._make_pr(comments) + retries, is_stale, prev = count_babysit_retries_per_workflow( + pr, 'bert-e', branch_name, new_commit) + + assert retries == {'CI Build': 1} + assert is_stale is True + assert prev == old_commit[:7] + + def test_new_babysit_clears_stale(self): + """Test that re-invoking /babysit clears stale flag.""" + branch_name = 'w/5.0/feature/test' + old_commit = 'abc1234567890' + new_commit = 'def9876543210' + comments = [ + self._make_comment('user', '@bert-e babysit'), + self._make_comment('bert-e', self._make_retry_comment( + branch_name, old_commit, + [('CI Build', 1, 5)] + )), + # User pushes new commit and re-invokes babysit + self._make_comment('user', '@bert-e babysit'), + ] + pr = self._make_pr(comments) + retries, is_stale, prev = count_babysit_retries_per_workflow( + pr, 'bert-e', branch_name, new_commit) + + assert retries == {} + assert is_stale is False + assert prev is None + + +class TestBabysitExceptions: + """Tests for BabysitRetry, BabysitExhausted, and BabysitCancelled.""" + + def test_babysit_retry_exception(self): + """Test BabysitRetry exception creation.""" + branch = SimpleNamespace(name='w/5.0/feature/test') + exc = BabysitRetry( + active_options=['babysit'], + branch=branch, + build_url='https://github.com/org/repo/actions/runs/123', + commit_sha='abc1234567890', + workflows=[ + {'id': 1, 'name': 'CI Build', 'retry_count': 2}, + {'id': 2, 'name': 'Tests', 'retry_count': 1}, + ], + max_retries=5, + ) + + assert exc.code == 140 + assert exc.status == "in_progress" + + def test_babysit_exhausted_exception(self): + """Test BabysitExhausted exception creation.""" + branch = SimpleNamespace(name='w/5.0/feature/test') + exc = BabysitExhausted( + active_options=['babysit'], + branch=branch, + build_url='https://github.com/org/repo/actions/runs/123', + max_retries=5, + robot='bert-e', + exhausted_workflows=['CI Build', 'Tests'], + ) + + assert exc.code == 141 + assert exc.status == "failure" + + def test_babysit_cancelled_exception(self): + """Test BabysitCancelled exception creation.""" + branch = SimpleNamespace(name='w/5.0/feature/test') + exc = BabysitCancelled( + active_options=['babysit'], + 
branch=branch, + previous_commit='abc1234567890', + current_commit='def9876543210', + robot='bert-e', + ) + + assert exc.code == 142 + assert exc.status == "in_progress" + + +class TestHandleBabysitRetry: + """Tests for handle_babysit_retry function.""" + + def _make_comment(self, author, text): + """Create a mock comment.""" + comment = MagicMock() + comment.author = author + comment.text = text + return comment + + def _make_retry_comment(self, branch_name, commit, workflows): + """Create a BabysitRetry-like comment text.""" + lines = [ + BABYSIT_RETRY_MARKER, + f'failed on branch `{branch_name}` (commit `{commit[:7]}`)', + '| Workflow | Retry |', + '|:---------|:-----:|', + ] + for wf_name, retry_count, max_retries in workflows: + lines.append(f'| `{wf_name}` | {retry_count}/{max_retries} |') + return '\n'.join(lines) + + def _make_job(self, babysit=False, host='github', build_key='github_actions', + max_retries=5, comments=None): + """Create a mock job with settings.""" + settings = SimpleNamespace( + babysit=babysit, + repository_host=host, + repository_owner='octo-org', + repository_slug='Hello-World', + max_babysit_retries=max_retries, + robot='bert-e', + ) + project_repo = MagicMock() + project_repo.get_build_url.return_value = 'https://example.com/build' + project_repo.rerun_failed_workflow_jobs = MagicMock() + + pull_request = MagicMock() + pull_request.comments = comments or [] + + job = SimpleNamespace( + settings=settings, + project_repo=project_repo, + active_options=['babysit'] if babysit else [], + pull_request=pull_request, + ) + return job + + def _make_branch(self, commit='abc1234567890', name='w/5.0/feature/test'): + """Create a mock branch.""" + branch = MagicMock() + branch.name = name + branch.get_latest_commit.return_value = commit + return branch + + def test_babysit_disabled_returns_false(self): + """Test that babysit logic is skipped when disabled.""" + job = self._make_job(babysit=False) + branch = self._make_branch() + + result = handle_babysit_retry(job, branch, 'github_actions') + + assert result is False + + def test_babysit_skips_non_github(self): + """Test that babysit is skipped for non-GitHub hosts.""" + job = self._make_job(babysit=True, host='bitbucket') + branch = self._make_branch() + + result = handle_babysit_retry(job, branch, 'github_actions') + + assert result is False + + def test_babysit_retry_per_workflow(self, client): + """Test that babysit tracks retries per workflow.""" + branch_name = 'w/5.0/feature/test' + commit = 'abc1234567890' + + # CI Build has 2 retries, Tests has 0 + comments = [ + self._make_comment('user', '@bert-e babysit'), + self._make_comment('bert-e', self._make_retry_comment( + branch_name, commit, + [('CI Build', 1, 5)] + )), + self._make_comment('bert-e', self._make_retry_comment( + branch_name, commit, + [('CI Build', 2, 5)] + )), + ] + job = self._make_job(babysit=True, max_retries=5, comments=comments) + branch = self._make_branch(commit=commit, name=branch_name) + + # Both workflows fail + workflow_run_json = { + 'workflow_runs': [ + { + 'id': 11111, + 'head_sha': commit, + 'head_branch': branch_name, + 'status': 'completed', + 'event': 'pull_request', + 'workflow_id': 1, + 'check_suite_id': 1, + 'conclusion': 'failure', + 'run_attempt': 3, + 'name': 'CI Build', + 'repository': { + 'full_name': 'octo-org/Hello-World', + 'owner': {'login': 'octo-org'}, + 'name': 'Hello-World' + } + }, + { + 'id': 22222, + 'head_sha': commit, + 'head_branch': branch_name, + 'status': 'completed', + 'event': 'pull_request', + 
'workflow_id': 2, + 'check_suite_id': 2, + 'conclusion': 'failure', + 'run_attempt': 1, + 'name': 'Tests', + 'repository': { + 'full_name': 'octo-org/Hello-World', + 'owner': {'login': 'octo-org'}, + 'name': 'Hello-World' + } + }, + ], + 'total_count': 2 + } + workflow_runs = AggregatedWorkflowRuns(client, **workflow_run_json) + + with patch.object(AggregatedWorkflowRuns, 'get', + return_value=workflow_runs): + with pytest.raises(BabysitRetry) as exc_info: + handle_babysit_retry(job, branch, 'github_actions') + + # Both workflows should be retried + assert job.project_repo.rerun_failed_workflow_jobs.call_count == 2 + + # Check the workflows in the exception + workflows = exc_info.value.kwargs['workflows'] + wf_names = {wf['name'] for wf in workflows} + assert 'CI Build' in wf_names + assert 'Tests' in wf_names + + # CI Build should be at retry 3, Tests at retry 1 + ci_wf = next(wf for wf in workflows if wf['name'] == 'CI Build') + tests_wf = next(wf for wf in workflows if wf['name'] == 'Tests') + assert ci_wf['retry_count'] == 3 + assert tests_wf['retry_count'] == 1 + + def test_workflow_exhausted_individually(self, client): + """Test that workflows are exhausted individually.""" + branch_name = 'w/5.0/feature/test' + commit = 'abc1234567890' + + # CI Build has 5 retries (exhausted), Tests has 2 + comments = [ + self._make_comment('user', '@bert-e babysit'), + ] + for i in range(5): + comments.append(self._make_comment('bert-e', self._make_retry_comment( + branch_name, commit, + [('CI Build', i + 1, 5)] + ))) + for i in range(2): + comments.append(self._make_comment('bert-e', self._make_retry_comment( + branch_name, commit, + [('Tests', i + 1, 5)] + ))) + + job = self._make_job(babysit=True, max_retries=5, comments=comments) + branch = self._make_branch(commit=commit, name=branch_name) + + # Both workflows fail + workflow_run_json = { + 'workflow_runs': [ + { + 'id': 11111, + 'head_sha': commit, + 'head_branch': branch_name, + 'status': 'completed', + 'event': 'pull_request', + 'workflow_id': 1, + 'check_suite_id': 1, + 'conclusion': 'failure', + 'run_attempt': 6, + 'name': 'CI Build', + 'repository': { + 'full_name': 'octo-org/Hello-World', + 'owner': {'login': 'octo-org'}, + 'name': 'Hello-World' + } + }, + { + 'id': 22222, + 'head_sha': commit, + 'head_branch': branch_name, + 'status': 'completed', + 'event': 'pull_request', + 'workflow_id': 2, + 'check_suite_id': 2, + 'conclusion': 'failure', + 'run_attempt': 3, + 'name': 'Tests', + 'repository': { + 'full_name': 'octo-org/Hello-World', + 'owner': {'login': 'octo-org'}, + 'name': 'Hello-World' + } + }, + ], + 'total_count': 2 + } + workflow_runs = AggregatedWorkflowRuns(client, **workflow_run_json) + + with patch.object(AggregatedWorkflowRuns, 'get', + return_value=workflow_runs): + with pytest.raises(BabysitRetry) as exc_info: + handle_babysit_retry(job, branch, 'github_actions') + + # Only Tests should be retried (CI Build is exhausted) + job.project_repo.rerun_failed_workflow_jobs.assert_called_once_with(22222) + + # Check that only Tests is in the retry list + workflows = exc_info.value.kwargs['workflows'] + assert len(workflows) == 1 + assert workflows[0]['name'] == 'Tests' + assert workflows[0]['retry_count'] == 3 + + def test_all_workflows_exhausted(self, client): + """Test that BabysitExhausted is raised when all workflows exhausted.""" + branch_name = 'w/5.0/feature/test' + commit = 'abc1234567890' + + # Both workflows have 5 retries (exhausted) + comments = [ + self._make_comment('user', '@bert-e babysit'), + ] + for i in 
range(5): + comments.append(self._make_comment('bert-e', self._make_retry_comment( + branch_name, commit, + [('CI Build', i + 1, 5), ('Tests', i + 1, 5)] + ))) + + job = self._make_job(babysit=True, max_retries=5, comments=comments) + branch = self._make_branch(commit=commit, name=branch_name) + + # Both workflows fail + workflow_run_json = { + 'workflow_runs': [ + { + 'id': 11111, + 'head_sha': commit, + 'head_branch': branch_name, + 'status': 'completed', + 'event': 'pull_request', + 'workflow_id': 1, + 'check_suite_id': 1, + 'conclusion': 'failure', + 'run_attempt': 6, + 'name': 'CI Build', + 'repository': { + 'full_name': 'octo-org/Hello-World', + 'owner': {'login': 'octo-org'}, + 'name': 'Hello-World' + } + }, + { + 'id': 22222, + 'head_sha': commit, + 'head_branch': branch_name, + 'status': 'completed', + 'event': 'pull_request', + 'workflow_id': 2, + 'check_suite_id': 2, + 'conclusion': 'failure', + 'run_attempt': 6, + 'name': 'Tests', + 'repository': { + 'full_name': 'octo-org/Hello-World', + 'owner': {'login': 'octo-org'}, + 'name': 'Hello-World' + } + }, + ], + 'total_count': 2 + } + workflow_runs = AggregatedWorkflowRuns(client, **workflow_run_json) + + with patch.object(AggregatedWorkflowRuns, 'get', + return_value=workflow_runs): + with pytest.raises(BabysitExhausted) as exc_info: + handle_babysit_retry(job, branch, 'github_actions') + + # No reruns should be called + job.project_repo.rerun_failed_workflow_jobs.assert_not_called() + + # Check exhausted workflows + exhausted = exc_info.value.kwargs['exhausted_workflows'] + assert 'CI Build' in exhausted + assert 'Tests' in exhausted + + def test_babysit_cancelled_on_new_commit(self, client): + """Test that babysit is cancelled when new commits are pushed.""" + branch_name = 'w/5.0/feature/test' + old_commit = 'abc1234567890' + new_commit = 'def9876543210' + + comments = [ + self._make_comment('user', '@bert-e babysit'), + self._make_comment('bert-e', self._make_retry_comment( + branch_name, old_commit, + [('CI Build', 1, 5)] + )), + ] + + job = self._make_job(babysit=True, max_retries=5, comments=comments) + branch = self._make_branch(commit=new_commit, name=branch_name) + + with pytest.raises(BabysitCancelled) as exc_info: + handle_babysit_retry(job, branch, 'github_actions') + + assert exc_info.value.kwargs['previous_commit'] == old_commit[:7] + assert exc_info.value.kwargs['current_commit'] == new_commit + + +class TestCheckPrBabysitEnabled: + """Tests for _check_pr_babysit_enabled function.""" + + def test_babysit_enabled_in_comments(self): + """Test detecting babysit option from PR comments.""" + comment = MagicMock() + comment.author = 'user' + comment.text = '@bert-e babysit' + + pull_request = MagicMock() + pull_request.comments = [comment] + pull_request.author = 'user' + + settings = SimpleNamespace( + robot='bert-e', + admins=[], + ) + + result = _check_pr_babysit_enabled(pull_request, settings) + assert result is True + + def test_babysit_not_enabled(self): + """Test when babysit is not in PR comments.""" + comment = MagicMock() + comment.author = 'user' + comment.text = '@bert-e approve' + + pull_request = MagicMock() + pull_request.comments = [comment] + pull_request.author = 'user' + + settings = SimpleNamespace( + robot='bert-e', + admins=[], + ) + + result = _check_pr_babysit_enabled(pull_request, settings) + assert result is False + + +class TestQueueBabysitRetry: + """Tests for queue babysit retry functionality.""" + + def test_queue_babysit_skips_non_github(self): + """Test queue babysit is skipped for 
non-GitHub hosts.""" + settings = SimpleNamespace( + repository_host='bitbucket', + max_babysit_retries=5, + ) + job = SimpleNamespace(settings=settings) + queues = MagicMock() + queues.build_key = 'github_actions' + + result = _handle_queue_babysit_retry(job, queues, [123]) + assert result is False + + def test_queue_babysit_skips_non_github_actions(self): + """Test queue babysit is skipped for non-github_actions build key.""" + settings = SimpleNamespace( + repository_host='github', + max_babysit_retries=5, + ) + job = SimpleNamespace(settings=settings) + queues = MagicMock() + queues.build_key = 'pre-merge' + + result = _handle_queue_babysit_retry(job, queues, [123]) + assert result is False diff --git a/bert_e/workflow/gitwaterflow/__init__.py b/bert_e/workflow/gitwaterflow/__init__.py index 7bda9d6c..3e02636c 100644 --- a/bert_e/workflow/gitwaterflow/__init__.py +++ b/bert_e/workflow/gitwaterflow/__init__.py @@ -628,6 +628,8 @@ def check_build_status(job, wbranches): BuildFailed: if a build failed or was stopped. BuildNotStarted: if a build hasn't started yet. BuildInProgress: if a build is still in progress. + BabysitRetry: if babysit mode is active and retrying failed jobs. + BabysitExhausted: if babysit mode exhausted all retries. """ @@ -651,6 +653,10 @@ def status(branch): worst = max(wbranches, key=lambda b: ordered_state[statuses[b.name]]) worst_status = statuses[worst.name] if worst_status in ('FAILED', 'STOPPED'): + # Check if babysit mode should handle the failure + if _handle_babysit_retry(job, worst, key): + return # Babysit handled the failure (raised an exception) + raise messages.BuildFailed( active_options=job.active_options, branch=worst, @@ -667,3 +673,7 @@ def status(branch): elif worst_status == 'INPROGRESS': raise messages.BuildInProgress() assert worst_status == 'SUCCESSFUL' + + +# Import the shared babysit module +from .babysit import handle_babysit_retry as _handle_babysit_retry diff --git a/bert_e/workflow/gitwaterflow/babysit.py b/bert_e/workflow/gitwaterflow/babysit.py new file mode 100644 index 00000000..887e6ced --- /dev/null +++ b/bert_e/workflow/gitwaterflow/babysit.py @@ -0,0 +1,290 @@ +# Copyright 2016-2026 Scality +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Babysit feature - automatic retry of failed GitHub Actions builds. + +This module provides the babysit functionality that automatically retries +failed GitHub Actions builds when the /babysit option is enabled on a +pull request. The feature: + +1. Monitors build failures on any branch (integration branches w/*, queue + branches, etc.) +2. Automatically triggers GitHub's "Re-run failed jobs" for failed workflow + runs +3. Tracks retry count PER WORKFLOW by parsing Bert-E's BabysitRetry comments + since the last /babysit command from the user +4. After max_babysit_retries (configurable, default 5) for a workflow, that + workflow is considered exhausted. When ALL failed workflows are exhausted, + posts a BabysitExhausted notification +5. 
Users can comment /babysit again to reset all retry counters and get + additional retries +6. If new commits are pushed, babysit is cancelled and must be re-invoked + +""" +import logging +import re +from collections import defaultdict + +from bert_e import exceptions as messages + + +LOG = logging.getLogger(__name__) + +# Marker text used to identify BabysitRetry comments from Bert-E +BABYSIT_RETRY_MARKER = "Babysit: Retrying build" + +# Regex to extract commit SHA from BabysitRetry comments +# Matches: (commit `abc1234`) where abc1234 is 7+ hex chars +COMMIT_SHA_PATTERN = re.compile(r'\(commit `([a-f0-9]{7,40})`\)') + +# Regex to extract workflow names from the retry table +# Matches: | `workflow_name` | X/Y | +WORKFLOW_RETRY_PATTERN = re.compile(r'\| `([^`]+)` \| (\d+)/(\d+) \|') + + +def count_babysit_retries_per_workflow(pull_request, robot_name, branch_name, + current_commit): + """Count how many babysit retries have been done for each workflow. + + This parses BabysitRetry comments posted by Bert-E for the specific branch + since the last /babysit command, and extracts retry counts per workflow. + + Also detects if babysit is "stale" - i.e., retries were done for a + different commit than the current one. + + Args: + pull_request: The pull request to check comments on. + robot_name: The robot's username (e.g., "bert-e"). + branch_name: The branch name to count retries for. + current_commit: The current commit SHA on the branch. + + Returns: + tuple: (workflow_retries, is_stale, previous_commit) + - workflow_retries: dict mapping workflow name to retry count + - is_stale: True if retries were for a different commit. + - previous_commit: The commit SHA from previous retries (if stale), + or None. + + """ + workflow_retries = defaultdict(int) + previous_commit = None + is_stale = False + + # Pattern to find /babysit command from users + babysit_cmd_pattern = re.compile( + r'@' + re.escape(robot_name) + r'\s+babysit\b', + re.IGNORECASE + ) + + for comment in pull_request.comments: + author = comment.author + text = comment.text + + if author == robot_name: + # Check if this is a BabysitRetry comment for our branch + if (BABYSIT_RETRY_MARKER in text and + f"`{branch_name}`" in text): + + # Extract commit SHA from the comment + sha_match = COMMIT_SHA_PATTERN.search(text) + if sha_match: + comment_commit = sha_match.group(1) + # Check if this retry was for a different commit + if not current_commit.startswith(comment_commit): + is_stale = True + previous_commit = comment_commit + + # Extract workflow retry counts from the table + for wf_match in WORKFLOW_RETRY_PATTERN.finditer(text): + workflow_name = wf_match.group(1) + # The retry count in the message is the current retry number + # We just need to track that this workflow was retried + workflow_retries[workflow_name] += 1 + else: + # Check if user sent /babysit command - this resets all counts + if babysit_cmd_pattern.search(text): + workflow_retries = defaultdict(int) + is_stale = False + previous_commit = None + + return dict(workflow_retries), is_stale, previous_commit + + +def handle_babysit_retry(job, failed_branch, build_key, pull_request=None): + """Handle babysit retry logic for failed builds. + + This function is called when a build fails and the babysit option is + enabled. It will automatically retry the failed GitHub Actions jobs + up to max_babysit_retries times PER WORKFLOW. + + The retry count is tracked per workflow by parsing Bert-E's BabysitRetry + comments since the last /babysit command. 
This allows users to re-invoke + /babysit to get additional retries after exhaustion. + + If new commits are pushed after babysit was invoked, babysit is + cancelled and the user must re-invoke it. + + Args: + job: The current job. + failed_branch: The branch with the failed build (integration branch, + queue branch, or any branch). + build_key: The build key being checked (must be 'github_actions'). + pull_request: Optional pull request to check for comments. If not + provided, uses job.pull_request. + + Returns: + True if babysit handled the failure (always raises an exception). + False if babysit is not applicable. + + Raises: + BabysitRetry: if retrying the failed jobs. + BabysitExhausted: if max retries reached for all failed workflows. + BabysitCancelled: if new commits were pushed since babysit was invoked. + + """ + # Use the provided pull_request or fall back to job's pull_request + pr = pull_request or getattr(job, 'pull_request', None) + if pr is None: + LOG.debug("Babysit: no pull request available") + return False + + # Check if babysit is enabled + if not job.settings.babysit: + return False + + # Babysit only works for GitHub with github_actions build key + if job.settings.repository_host != 'github': + LOG.debug("Babysit: skipping, not GitHub host") + return False + + if build_key != 'github_actions': + LOG.debug("Babysit: skipping, build_key is not github_actions") + return False + + branch_name = failed_branch.name + commit_sha = failed_branch.get_latest_commit() + max_retries = job.settings.max_babysit_retries + + LOG.info("Babysit: checking failed build on branch %s (commit %s)", + branch_name, commit_sha[:7]) + + # Count existing retries per workflow and check for stale babysit + workflow_retries, is_stale, previous_commit = \ + count_babysit_retries_per_workflow( + pr, job.settings.robot, branch_name, commit_sha + ) + + LOG.info("Babysit: branch=%s, workflow_retries=%s, is_stale=%s", + branch_name, workflow_retries, is_stale) + + build_url = job.project_repo.get_build_url(commit_sha, build_key) + + # Check if babysit is stale (new commits pushed since babysit was invoked) + if is_stale and previous_commit: + LOG.info("Babysit: cancelled for %s due to new commits " + "(was: %s, now: %s)", + branch_name, previous_commit, commit_sha[:7]) + raise messages.BabysitCancelled( + active_options=job.active_options, + branch=failed_branch, + previous_commit=previous_commit, + current_commit=commit_sha, + robot=job.settings.robot, + ) + + # Get the workflow runs for the failed commit + from bert_e.git_host.github import AggregatedWorkflowRuns + + try: + workflow_runs = AggregatedWorkflowRuns.get( + client=job.project_repo.client, + owner=job.settings.repository_owner, + repo=job.settings.repository_slug, + params={'head_sha': commit_sha} + ) + except Exception as err: + LOG.warning("Babysit: failed to get workflow runs for %s: %s", + branch_name, err) + return False + + # Get failed runs + failed_runs = workflow_runs.get_failed_runs() + if not failed_runs: + LOG.debug("Babysit: no failed workflow runs found for %s", branch_name) + return False + + # Categorize workflows: which can be retried, which are exhausted + workflows_to_retry = [] + exhausted_workflows = [] + + for run in failed_runs: + workflow_name = run.get('name', f"workflow_{run['id']}") + current_count = workflow_retries.get(workflow_name, 0) + + if current_count >= max_retries: + LOG.info("Babysit: workflow '%s' exhausted (%d/%d)", + workflow_name, current_count, max_retries) + 
exhausted_workflows.append(workflow_name) + else: + workflows_to_retry.append({ + 'id': run['id'], + 'name': workflow_name, + 'retry_count': current_count + 1, # This will be the new count + }) + + # If all failed workflows are exhausted, raise BabysitExhausted + if not workflows_to_retry and exhausted_workflows: + LOG.info("Babysit: all workflows exhausted for %s: %s", + branch_name, exhausted_workflows) + raise messages.BabysitExhausted( + active_options=job.active_options, + branch=failed_branch, + build_url=build_url, + max_retries=max_retries, + robot=job.settings.robot, + exhausted_workflows=exhausted_workflows, + ) + + # If no workflows to retry (but also none exhausted), something is off + if not workflows_to_retry: + LOG.warning("Babysit: no workflows to retry and none exhausted for %s", + branch_name) + return False + + # Trigger re-run of each workflow that hasn't exhausted retries + rerun_triggered = False + for wf in workflows_to_retry: + try: + LOG.info("Babysit: re-running failed jobs for workflow '%s' " + "(id=%d) on %s, retry %d/%d", + wf['name'], wf['id'], branch_name, + wf['retry_count'], max_retries) + job.project_repo.rerun_failed_workflow_jobs(wf['id']) + rerun_triggered = True + except Exception as err: + LOG.warning("Babysit: failed to rerun workflow %d (%s) on %s: %s", + wf['id'], wf['name'], branch_name, err) + + if not rerun_triggered: + LOG.warning("Babysit: could not trigger any reruns for %s", branch_name) + return False + + # Raise BabysitRetry with per-workflow information + raise messages.BabysitRetry( + active_options=job.active_options, + branch=failed_branch, + build_url=build_url, + commit_sha=commit_sha, + workflows=workflows_to_retry, + max_retries=max_retries, + ) diff --git a/bert_e/workflow/gitwaterflow/commands.py b/bert_e/workflow/gitwaterflow/commands.py index eb5b5bc2..7526b5b3 100644 --- a/bert_e/workflow/gitwaterflow/commands.py +++ b/bert_e/workflow/gitwaterflow/commands.py @@ -228,3 +228,8 @@ def setup(defaults={}): "wait", "Instruct Bert-E not to run until further notice.", default=defaults.get("wait", False)) + Reactor.add_option( + "babysit", + "Automatically retry failed GitHub Actions builds.", + privileged=False, + default=defaults.get("babysit", False)) diff --git a/bert_e/workflow/gitwaterflow/queueing.py b/bert_e/workflow/gitwaterflow/queueing.py index ccc4afab..22b5b5bb 100644 --- a/bert_e/workflow/gitwaterflow/queueing.py +++ b/bert_e/workflow/gitwaterflow/queueing.py @@ -15,11 +15,13 @@ import logging from copy import deepcopy +from types import SimpleNamespace from bert_e import exceptions from bert_e.job import handler as job_handler from bert_e.job import QueuesJob, PullRequestJob from bert_e.lib import git +from bert_e.reactor import Reactor from ..git_utils import clone_git_repo, consecutive_merge, robust_merge, push from ..pr_utils import notify_user @@ -28,12 +30,140 @@ QueueIntegrationBranch, branch_factory, build_queue_collection) from .integration import get_integration_branches -from typing import List +from typing import List, Dict, Any LOG = logging.getLogger(__name__) +def _check_pr_babysit_enabled(pull_request, settings) -> bool: + """Check if the babysit option is enabled for a pull request. + + Args: + pull_request: The pull request to check. + settings: The bot settings. + + Returns: + True if babysit is enabled, False otherwise. 
+ """ + # Create a temporary job-like object to parse options + temp_job = SimpleNamespace(settings={}) + reactor = Reactor() + reactor.init_settings(temp_job) + + prefix = '@{}'.format(settings.robot) + admins = settings.admins + + # Parse options from comments (ignore errors, just check for babysit) + for comment in pull_request.comments: + author = comment.author + privileged = author in admins + authored = author == pull_request.author + text = comment.text + try: + reactor.handle_options(temp_job, text, prefix, privileged, authored) + except Exception: + # Ignore errors, we just want to check for babysit + pass + + return temp_job.settings.get('babysit', False) + + +def _handle_queue_babysit_retry(job: QueuesJob, queues: QueueCollection, + failed_prs: List[int]) -> bool: + """Handle babysit retry logic for failed queue builds. + + For each failed PR in the queue that has babysit enabled, this function + will use the shared babysit logic to retry the failed GitHub Actions jobs + on the queue branch. + + The retry counting is done by counting Bert-E's BabysitRetry comments + for each specific branch since the last /babysit command from the user. + This allows users to comment /babysit again to get additional retries. + + Args: + job: The queue job. + queues: The queue collection with build status info. + failed_prs: List of PR IDs with failed builds. + + Returns: + True if any retries were triggered. + False if no babysit retries were applicable. + """ + from .babysit import handle_babysit_retry + from ..pr_utils import notify_user + + # Babysit only works for GitHub with github_actions build key + if job.settings.repository_host != 'github': + LOG.debug("Queue babysit: skipping, not GitHub host") + return False + + if queues.build_key != 'github_actions': + LOG.debug("Queue babysit: skipping, build_key is not github_actions") + return False + + retried_any = False + + for pr_id in failed_prs: + try: + pull_request = job.project_repo.get_pull_request(pr_id) + except Exception as err: + LOG.warning("Queue babysit: failed to get PR %d: %s", pr_id, err) + continue + + # Check if this PR has babysit enabled + if not _check_pr_babysit_enabled(pull_request, job.settings): + LOG.debug("Queue babysit: PR %d does not have babysit enabled", + pr_id) + continue + + # Find the queue integration branches for this PR + for version in queues._queues.keys(): + qints = queues._queues[version][QueueIntegrationBranch] + for qint in qints: + if qint.pr_id != pr_id: + continue + + # Check build status on this queue branch + commit_sha = qint.get_latest_commit() + status = queues.bbrepo.get_build_status(commit_sha, + queues.build_key) + if status != 'FAILED': + continue + + LOG.info("Queue babysit: checking failed build on %s for PR %d", + qint.name, pr_id) + + # Create a temporary job-like object for the shared babysit logic + temp_job = SimpleNamespace( + settings=job.settings, + project_repo=job.project_repo, + active_options=['babysit'], + pull_request=pull_request, + ) + + try: + # Use the shared babysit logic + handle_babysit_retry( + temp_job, qint, queues.build_key, + pull_request=pull_request + ) + except exceptions.BabysitRetry as retry_exc: + # Notify the PR about the retry + notify_user(job.settings, pull_request, retry_exc) + retried_any = True + except exceptions.BabysitExhausted as exhausted_exc: + # Notify the PR about exhaustion + notify_user(job.settings, pull_request, exhausted_exc) + retried_any = True # We handled it, just not with a retry + except exceptions.BabysitCancelled as 
cancelled_exc: + # Notify the PR that babysit was cancelled due to new commits + notify_user(job.settings, pull_request, cancelled_exc) + # Don't set retried_any - let normal failure handling proceed + + return retried_any + + def notify_queue_build_failed(failed_prs: List[int], job: QueuesJob): """Notify on the pull request that the queue build failed.""" # TODO: As this feature evolves, we might want to include @@ -73,6 +203,11 @@ def handle_merge_queues(job): if not failed_prs: raise exceptions.NothingToDo() else: + # Check if babysit should handle the failed queue builds + if _handle_queue_babysit_retry(job, queues, failed_prs): + # Babysit triggered retries, raise BuildInProgress to wait + raise exceptions.BuildInProgress() + notify_queue_build_failed(failed_prs, job) raise exceptions.QueueBuildFailed() From 608657bdc9a421486113a7d6b661d25f5aa5dcbd Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Watenberg Date: Mon, 12 Jan 2026 10:36:12 +0100 Subject: [PATCH 2/2] Fix linting and tests --- bert_e/git_host/github/__init__.py | 3 +- bert_e/tests/unit/test_babysit.py | 52 +++++++++++++++--------- bert_e/workflow/gitwaterflow/__init__.py | 3 +- bert_e/workflow/gitwaterflow/babysit.py | 11 ++--- bert_e/workflow/gitwaterflow/queueing.py | 22 ++++++---- 5 files changed, 55 insertions(+), 36 deletions(-) diff --git a/bert_e/git_host/github/__init__.py b/bert_e/git_host/github/__init__.py index 767cb5b8..df506627 100644 --- a/bert_e/git_host/github/__init__.py +++ b/bert_e/git_host/github/__init__.py @@ -525,7 +525,8 @@ def rerun_failed_workflow_jobs(self, run_id: int) -> None: run_id: The ID of the workflow run to re-run failed jobs for. """ - url = f'/repos/{self.owner}/{self.slug}/actions/runs/{run_id}/rerun-failed-jobs' + url = (f'/repos/{self.owner}/{self.slug}/actions/runs/' + f'{run_id}/rerun-failed-jobs') self.client.post(url, data='{}') diff --git a/bert_e/tests/unit/test_babysit.py b/bert_e/tests/unit/test_babysit.py index 038e5af0..9b2a91b2 100644 --- a/bert_e/tests/unit/test_babysit.py +++ b/bert_e/tests/unit/test_babysit.py @@ -27,6 +27,11 @@ from bert_e.workflow.gitwaterflow.queueing import ( _check_pr_babysit_enabled, _handle_queue_babysit_retry ) +# Import setup to register reactor options +from bert_e.workflow.gitwaterflow.commands import setup as gwf_setup + +# Call setup to register all options including babysit +gwf_setup() @pytest.fixture @@ -42,7 +47,7 @@ def client(): @pytest.fixture def failed_workflow_run_json(): - """Workflow run JSON with a failed run on an integration branch (w/).""" + """Workflow run JSON with failed run on an integration branch.""" return { 'workflow_runs': [ { @@ -56,7 +61,7 @@ def failed_workflow_run_json(): 'conclusion': 'failure', 'run_attempt': 2, 'name': 'CI Build', - 'html_url': 'https://github.com/octo-org/Hello-World/actions/runs/12345', + 'html_url': 'https://github.com/org/repo/actions/runs/12345', 'repository': { 'full_name': 'octo-org/Hello-World', 'owner': {'login': 'octo-org'}, @@ -406,8 +411,8 @@ def _make_retry_comment(self, branch_name, commit, workflows): lines.append(f'| `{wf_name}` | {retry_count}/{max_retries} |') return '\n'.join(lines) - def _make_job(self, babysit=False, host='github', build_key='github_actions', - max_retries=5, comments=None): + def _make_job(self, babysit=False, host='github', + build_key='github_actions', max_retries=5, comments=None): """Create a mock job with settings.""" settings = SimpleNamespace( babysit=babysit, @@ -549,15 +554,19 @@ def test_workflow_exhausted_individually(self, client): 
self._make_comment('user', '@bert-e babysit'), ] for i in range(5): - comments.append(self._make_comment('bert-e', self._make_retry_comment( - branch_name, commit, - [('CI Build', i + 1, 5)] - ))) + comments.append(self._make_comment( + 'bert-e', + self._make_retry_comment( + branch_name, commit, [('CI Build', i + 1, 5)] + ) + )) for i in range(2): - comments.append(self._make_comment('bert-e', self._make_retry_comment( - branch_name, commit, - [('Tests', i + 1, 5)] - ))) + comments.append(self._make_comment( + 'bert-e', + self._make_retry_comment( + branch_name, commit, [('Tests', i + 1, 5)] + ) + )) job = self._make_job(babysit=True, max_retries=5, comments=comments) branch = self._make_branch(commit=commit, name=branch_name) @@ -610,7 +619,9 @@ def test_workflow_exhausted_individually(self, client): handle_babysit_retry(job, branch, 'github_actions') # Only Tests should be retried (CI Build is exhausted) - job.project_repo.rerun_failed_workflow_jobs.assert_called_once_with(22222) + job.project_repo.rerun_failed_workflow_jobs.assert_called_once_with( + 22222 + ) # Check that only Tests is in the retry list workflows = exc_info.value.kwargs['workflows'] @@ -619,7 +630,7 @@ def test_workflow_exhausted_individually(self, client): assert workflows[0]['retry_count'] == 3 def test_all_workflows_exhausted(self, client): - """Test that BabysitExhausted is raised when all workflows exhausted.""" + """Test BabysitExhausted raised when all workflows exhausted.""" branch_name = 'w/5.0/feature/test' commit = 'abc1234567890' @@ -628,10 +639,13 @@ def test_all_workflows_exhausted(self, client): self._make_comment('user', '@bert-e babysit'), ] for i in range(5): - comments.append(self._make_comment('bert-e', self._make_retry_comment( - branch_name, commit, - [('CI Build', i + 1, 5), ('Tests', i + 1, 5)] - ))) + comments.append(self._make_comment( + 'bert-e', + self._make_retry_comment( + branch_name, commit, + [('CI Build', i + 1, 5), ('Tests', i + 1, 5)] + ) + )) job = self._make_job(babysit=True, max_retries=5, comments=comments) branch = self._make_branch(commit=commit, name=branch_name) @@ -772,7 +786,7 @@ def test_queue_babysit_skips_non_github(self): assert result is False def test_queue_babysit_skips_non_github_actions(self): - """Test queue babysit is skipped for non-github_actions build key.""" + """Test queue babysit skipped for non-github_actions build key.""" settings = SimpleNamespace( repository_host='github', max_babysit_retries=5, diff --git a/bert_e/workflow/gitwaterflow/__init__.py b/bert_e/workflow/gitwaterflow/__init__.py index 3e02636c..3adf3691 100644 --- a/bert_e/workflow/gitwaterflow/__init__.py +++ b/bert_e/workflow/gitwaterflow/__init__.py @@ -42,6 +42,7 @@ update_integration_branches) from .jira import jira_checks from . 
import queueing +from .babysit import handle_babysit_retry as _handle_babysit_retry LOG = logging.getLogger(__name__) @@ -675,5 +676,3 @@ def status(branch): assert worst_status == 'SUCCESSFUL' -# Import the shared babysit module -from .babysit import handle_babysit_retry as _handle_babysit_retry diff --git a/bert_e/workflow/gitwaterflow/babysit.py b/bert_e/workflow/gitwaterflow/babysit.py index 887e6ced..0ea160dd 100644 --- a/bert_e/workflow/gitwaterflow/babysit.py +++ b/bert_e/workflow/gitwaterflow/babysit.py @@ -52,8 +52,8 @@ WORKFLOW_RETRY_PATTERN = re.compile(r'\| `([^`]+)` \| (\d+)/(\d+) \|') -def count_babysit_retries_per_workflow(pull_request, robot_name, branch_name, - current_commit): +def count_babysit_retries_per_workflow(pull_request, robot_name, + branch_name, current_commit): """Count how many babysit retries have been done for each workflow. This parses BabysitRetry comments posted by Bert-E for the specific branch @@ -107,8 +107,8 @@ def count_babysit_retries_per_workflow(pull_request, robot_name, branch_name, # Extract workflow retry counts from the table for wf_match in WORKFLOW_RETRY_PATTERN.finditer(text): workflow_name = wf_match.group(1) - # The retry count in the message is the current retry number - # We just need to track that this workflow was retried + # The retry count in the message is the current number + # We just track that this workflow was retried workflow_retries[workflow_name] += 1 else: # Check if user sent /babysit command - this resets all counts @@ -276,7 +276,8 @@ def handle_babysit_retry(job, failed_branch, build_key, pull_request=None): wf['id'], wf['name'], branch_name, err) if not rerun_triggered: - LOG.warning("Babysit: could not trigger any reruns for %s", branch_name) + LOG.warning("Babysit: could not trigger any reruns for %s", + branch_name) return False # Raise BabysitRetry with per-workflow information diff --git a/bert_e/workflow/gitwaterflow/queueing.py b/bert_e/workflow/gitwaterflow/queueing.py index 22b5b5bb..902e2424 100644 --- a/bert_e/workflow/gitwaterflow/queueing.py +++ b/bert_e/workflow/gitwaterflow/queueing.py @@ -30,7 +30,7 @@ QueueIntegrationBranch, branch_factory, build_queue_collection) from .integration import get_integration_branches -from typing import List, Dict, Any +from typing import List LOG = logging.getLogger(__name__) @@ -61,7 +61,8 @@ def _check_pr_babysit_enabled(pull_request, settings) -> bool: authored = author == pull_request.author text = comment.text try: - reactor.handle_options(temp_job, text, prefix, privileged, authored) + reactor.handle_options( + temp_job, text, prefix, privileged, authored) except Exception: # Ignore errors, we just want to check for babysit pass @@ -69,8 +70,9 @@ def _check_pr_babysit_enabled(pull_request, settings) -> bool: return temp_job.settings.get('babysit', False) -def _handle_queue_babysit_retry(job: QueuesJob, queues: QueueCollection, - failed_prs: List[int]) -> bool: +def _handle_queue_babysit_retry(job: QueuesJob, + queues: QueueCollection, + failed_prs: List[int]) -> bool: """Handle babysit retry logic for failed queue builds. 
For each failed PR in the queue that has babysit enabled, this function @@ -131,8 +133,9 @@ def _handle_queue_babysit_retry(job: QueuesJob, queues: QueueCollection, if status != 'FAILED': continue - LOG.info("Queue babysit: checking failed build on %s for PR %d", - qint.name, pr_id) + LOG.info( + "Queue babysit: checking failed build on %s for PR %d", + qint.name, pr_id) # Create a temporary job-like object for the shared babysit logic temp_job = SimpleNamespace( @@ -155,11 +158,12 @@ def _handle_queue_babysit_retry(job: QueuesJob, queues: QueueCollection, except exceptions.BabysitExhausted as exhausted_exc: # Notify the PR about exhaustion notify_user(job.settings, pull_request, exhausted_exc) - retried_any = True # We handled it, just not with a retry + # We handled it, just not with a retry + retried_any = True except exceptions.BabysitCancelled as cancelled_exc: - # Notify the PR that babysit was cancelled due to new commits + # Babysit cancelled due to new commits notify_user(job.settings, pull_request, cancelled_exc) - # Don't set retried_any - let normal failure handling proceed + # Don't set retried_any - normal failure handling return retried_any
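
For readers who want to see the retry accounting from `count_babysit_retries_per_workflow` in isolation, the sketch below re-implements the core idea outside Bert-E. It reuses the `WORKFLOW_RETRY_PATTERN` regex shown in the patch and counts the robot's retry-table rows per workflow, resetting whenever the user comments `@bert-e babysit` again. This is a minimal illustration under simplifying assumptions: the real function also restricts the count to the failed branch and the current commit, and the `count_retries` helper and the sample comment history below are invented for this example only.

```python
# Simplified, standalone illustration of the per-workflow retry counting
# described in babysit.py.  It is NOT the actual implementation: the real
# count_babysit_retries_per_workflow() also filters BabysitRetry comments
# by branch name and commit, which is omitted here for brevity.
import re
from collections import defaultdict

# Same pattern as WORKFLOW_RETRY_PATTERN in babysit.py: matches table rows
# such as "| `CI Build` | 2/5 |" in the retry comments posted by Bert-E.
WORKFLOW_RETRY_PATTERN = re.compile(r'\| `([^`]+)` \| (\d+)/(\d+) \|')


def count_retries(comments, robot_name='bert-e'):
    """Count BabysitRetry comments per workflow since the last
    '@bert-e babysit' command.

    ``comments`` is a list of (author, text) tuples in chronological order.
    """
    retries = defaultdict(int)
    for author, text in comments:
        if author != robot_name:
            # A fresh '@bert-e babysit' command resets all counters,
            # which is how users get more retries after exhaustion.
            if f'@{robot_name} babysit' in text:
                retries.clear()
            continue
        # Each retry comment from the robot bumps the counter of every
        # workflow listed in its table.
        for match in WORKFLOW_RETRY_PATTERN.finditer(text):
            retries[match.group(1)] += 1
    return dict(retries)


if __name__ == '__main__':
    history = [
        ('alice', '@bert-e babysit'),
        ('bert-e', '| `CI Build` | 1/5 |\n| `Tests` | 1/5 |'),
        ('bert-e', '| `CI Build` | 2/5 |'),
    ]
    # Prints {'CI Build': 2, 'Tests': 1}: with max_babysit_retries=5,
    # CI Build still has 3 automatic retries left and Tests has 4.
    print(count_retries(history))
```

The per-workflow dictionary is what lets the queue and integration code retry only the workflows that have not yet reached their limit, while exhausted ones are reported in the `BabysitExhausted` message.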