From f2a09a3473273f3f129735a862d7c922131192a3 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Watenberg Date: Mon, 12 Jan 2026 09:36:16 +0100 Subject: [PATCH 1/2] Add babysit feature for automatic retries of failed GitHub Actions - Introduced `max_babysit_retries` setting in `settings.py` to configure the maximum number of retries (default is 5). - Added `babysit` option in command setup to enable automatic retries for failed builds. - Implemented new exceptions: `BabysitRetry`, `BabysitExhausted`, and `BabysitCancelled` in `exceptions.py` to handle different states of the babysit process. - Enhanced `Repository` class with a method to rerun failed workflow jobs. - Updated documentation to include details on the babysit feature and its usage. - Implemented logic to check and handle babysit retries in queue processing. This feature aims to reduce manual intervention for transient build failures in GitHub Actions. --- bert_e/docs/USER_DOC.md | 94 +++ bert_e/exceptions.py | 22 + bert_e/git_host/github/__init__.py | 38 ++ bert_e/git_host/github/schema.py | 3 + bert_e/settings.py | 3 + bert_e/templates/babysit_cancelled.md | 14 + bert_e/templates/babysit_exhausted.md | 23 + bert_e/templates/babysit_retry.md | 20 + bert_e/tests/unit/test_babysit.py | 785 +++++++++++++++++++++++ bert_e/workflow/gitwaterflow/__init__.py | 10 + bert_e/workflow/gitwaterflow/babysit.py | 290 +++++++++ bert_e/workflow/gitwaterflow/commands.py | 5 + bert_e/workflow/gitwaterflow/queueing.py | 137 +++- 13 files changed, 1443 insertions(+), 1 deletion(-) create mode 100644 bert_e/templates/babysit_cancelled.md create mode 100644 bert_e/templates/babysit_exhausted.md create mode 100644 bert_e/templates/babysit_retry.md create mode 100644 bert_e/tests/unit/test_babysit.py create mode 100644 bert_e/workflow/gitwaterflow/babysit.py diff --git a/bert_e/docs/USER_DOC.md b/bert_e/docs/USER_DOC.md index 54635034..4fd0a352 100644 --- a/bert_e/docs/USER_DOC.md +++ b/bert_e/docs/USER_DOC.md @@ -112,6 +112,7 @@ __Bert-E__. | options name | description | requires admin rights? | requires pull request author? | |:------------------------- |:------------------------ |:----------------------:|:-----------------------------:| | after_pull_request | Wait for the given pull request id to be merged before continuing with the current one. May be used like this: @bert-e after_pull_request=< pr_id_1 > ... | no | no +| babysit | Automatically retry failed GitHub Actions builds (see [Babysit](#babysit) section for details) | no | no | bypass_author_approval | Bypass the pull request author's approval | yes | no | bypass_build_status | Bypass the build and test status| yes | no | bypass_incompatible_branch | Bypass the check on the source branch prefix | yes | no @@ -482,6 +483,9 @@ to progress to the next step. message code | 122 | Unknown command | One of the participants asked __Bert-E__ to activate an option, or execute a command he doesn't know. Edit the corresponding message if it contains a typo. Delete it otherwise | 123 | Not authorized | One of the participants asked __Bert-E__ to activate a privileged option, or execute a privileged command, but doesn't have enough credentials to do so. Delete the corresponding command ask a __Bert-E__ administrator to run/set the desired command/option. | 134 | Not author | One of the participants asked __Bert-E__ to activate an authored option, but the participant is not the author of the pull request. 
+| 140 | Babysit: Retrying build | __Bert-E__ is automatically retrying failed GitHub Actions jobs because the babysit option is enabled. No action required - wait for the new build to complete. +| 141 | Babysit: Maximum retries reached | __Bert-E__ has exhausted all automatic retry attempts. Investigate the build failure. To get more retries, comment `@bert-e babysit` again. +| 142 | Babysit: Cancelled | __Bert-E__ cancelled the babysit option because new commits were pushed. To re-enable automatic retries, comment `@bert-e babysit` again. Queues ------ @@ -562,6 +566,96 @@ All those states can be found on Bert-E's UI. > Note: Bert-E will not notify the user if a build fails inside the queue. +Babysit +------- + +__The babysit option enables automatic retry of failed GitHub Actions builds.__ + +When working with GitHub Actions, builds can sometimes fail due to flaky tests, +transient infrastructure issues, or other temporary problems. The `babysit` +option allows __Bert-E__ to automatically retry failed workflow runs, reducing +the need for manual intervention. + +### Enabling Babysit + +To enable babysit on a pull request, comment: + + @bert-e babysit + +### How It Works + +When babysit is enabled and a build fails: + +1. __Bert-E__ detects the failed GitHub Actions workflow runs +2. For each failed workflow, __Bert-E__ triggers GitHub's "Re-run failed jobs" +3. __Bert-E__ posts a comment indicating the retry attempt +4. This process repeats until the build succeeds or the maximum retry limit + is reached + +### Scope of Babysit + +The babysit behavior applies to: + +* **Integration branches** (`w/x.y/...`): Failed builds on integration branches + are automatically retried +* **Queue branches** (`q/...`): Failed builds in the merge queue are also + retried if babysit was enabled on the corresponding pull request +* **All workflow runs individually**: Each GitHub Actions workflow is tracked + and retried independently. If you have multiple workflows (e.g., CI, Tests, + Lint), each one has its own retry counter. This means: + - If CI fails 5 times but Tests only fails twice, CI is exhausted while + Tests can still be retried 3 more times + - Only workflows that haven't reached their retry limit are retried + - __Bert-E__ shows a table with each workflow's retry count in the comments + +### Maximum Retries + +By default, __Bert-E__ will retry failed builds up to **5 times**. After the +maximum number of retries is reached, __Bert-E__ posts a `BabysitExhausted` +message indicating that automatic retries have been exhausted. + +This limit can be configured per repository by setting the +`max_babysit_retries` parameter in the repository's __Bert-E__ configuration: + +```yaml +max_babysit_retries: 10 # Allow up to 10 retries instead of the default 5 +``` + +### Re-enabling Babysit After Exhaustion + +If the maximum retries have been exhausted but you want to continue with +automatic retries, simply comment `@bert-e babysit` again. This resets the +retry counter and allows for another round of automatic retries. + +### Babysit Cancellation on New Commits + +**Important:** If you push new commits to your branch after enabling babysit, +the babysit option is automatically cancelled. This prevents stale retry +attempts from continuing on outdated code. + +When this happens, __Bert-E__ will post a `BabysitCancelled` message explaining +that new commits were detected. To re-enable automatic retries for the new +commits, you must comment `@bert-e babysit` again. + +> **Example workflow:** +> +> 1. 
You comment `@bert-e babysit` +> 2. Build fails, __Bert-E__ retries (attempt 1/5) +> 3. Build fails again, __Bert-E__ retries (attempt 2/5) +> 4. You push a fix to address the build failure +> 5. __Bert-E__ detects the new commit and cancels babysit +> 6. Build fails on the new commit +> 7. You comment `@bert-e babysit` again to enable retries for the new code +> 8. __Bert-E__ retries (attempt 1/5 - counter is reset) + +### Limitations + +* Babysit only works with **GitHub Actions** (`build_key: github_actions`) +* Babysit is not available for other CI systems (Bitbucket Pipelines, Jenkins, + etc.) +* Babysit does not bypass build failures - if the issue is not transient, the + build will continue to fail after all retries are exhausted + Going further with __Bert-E__ ----------------------------- Do you like __Bert-E__? Would like to use it on your own projects? diff --git a/bert_e/exceptions.py b/bert_e/exceptions.py index fb93bdcf..6fbb1e14 100644 --- a/bert_e/exceptions.py +++ b/bert_e/exceptions.py @@ -585,3 +585,25 @@ class JobFailure(SilentException): class QueueBuildFailed(SilentException): code = 309 + + +class BabysitRetry(TemplateException): + """Raised when babysit mode triggers a retry of failed GitHub Actions.""" + code = 140 + template = 'babysit_retry.md' + dont_repeat_if_in_history = 0 # allow repeating for each retry + status = "in_progress" + + +class BabysitExhausted(TemplateException): + """Raised when babysit mode has exhausted all retry attempts.""" + code = 141 + template = 'babysit_exhausted.md' + status = "failure" + + +class BabysitCancelled(TemplateException): + """Raised when babysit mode is cancelled due to new commits.""" + code = 142 + template = 'babysit_cancelled.md' + status = "in_progress" diff --git a/bert_e/git_host/github/__init__.py b/bert_e/git_host/github/__init__.py index c6daf27b..767cb5b8 100644 --- a/bert_e/git_host/github/__init__.py +++ b/bert_e/git_host/github/__init__.py @@ -518,6 +518,16 @@ def create_pull_request(self, title, src_branch, dst_branch, description, return PullRequest.create(self.client, data=kwargs, owner=self.owner, repo=self.slug) + def rerun_failed_workflow_jobs(self, run_id: int) -> None: + """Re-run only the failed jobs of a workflow run. + + Args: + run_id: The ID of the workflow run to re-run failed jobs for. + + """ + url = f'/repos/{self.owner}/{self.slug}/actions/runs/{run_id}/rerun-failed-jobs' + self.client.post(url, data='{}') + class AggregatedStatus(base.AbstractGitHostObject): GET_URL = '/repos/{owner}/{repo}/commits/{ref}/status' @@ -640,6 +650,34 @@ def branch(self) -> str | None: return self._workflow_runs[0]['head_branch'] return None + def get_failed_runs(self): + """Get workflow runs that have failed. + + This method filters workflow runs to keep only the most relevant run + per workflow (same logic as remove_unwanted_workflows), then returns + those that have failed. + + Returns: + List of dicts with 'id' and 'run_attempt' for each failed run. 
+ """ + # First, filter to get the best run per workflow (same as state check) + self.remove_unwanted_workflows() + + failed_runs = [] + for run in self._workflow_runs: + if run.get('conclusion') == 'failure': + failed_runs.append({ + 'id': run['id'], + 'run_attempt': run.get('run_attempt', 1), + 'workflow_id': run.get('workflow_id'), + 'name': run.get('name', 'unknown'), + 'html_url': run.get('html_url', ''), + }) + LOG.debug( + "Babysit: found failed run id=%d, run_attempt=%d, name=%s", + run['id'], run.get('run_attempt', 1), run.get('name', '')) + return failed_runs + def remove_unwanted_workflows(self): """ Remove two things: diff --git a/bert_e/git_host/github/schema.py b/bert_e/git_host/github/schema.py index 60370202..42a8bfa5 100644 --- a/bert_e/git_host/github/schema.py +++ b/bert_e/git_host/github/schema.py @@ -143,6 +143,9 @@ class WorkflowRun(GitHubSchema): event = fields.Str() repository = fields.Nested(Repo) workflow_id = fields.Integer() + # run_attempt indicates the number of times this workflow has been run + # Defaults to 1 for first run, increments with each rerun + run_attempt = fields.Integer(load_default=1) class AggregateWorkflowRuns(GitHubSchema): diff --git a/bert_e/settings.py b/bert_e/settings.py index d00cf318..2140553a 100644 --- a/bert_e/settings.py +++ b/bert_e/settings.py @@ -195,6 +195,9 @@ class Meta: send_bot_status = fields.Bool(required=False, load_default=False) + # Babysit feature: automatic retry of failed GitHub Actions + max_babysit_retries = fields.Int(required=False, load_default=5) + @pre_load(pass_many=True) def load_env(self, data, **kwargs): """Load environment variables""" diff --git a/bert_e/templates/babysit_cancelled.md b/bert_e/templates/babysit_cancelled.md new file mode 100644 index 00000000..c4a39f46 --- /dev/null +++ b/bert_e/templates/babysit_cancelled.md @@ -0,0 +1,14 @@ +{% extends "message.md" %} + +{% block title -%} +Babysit: Cancelled +{% endblock %} + +{% block message %} +**Babysit mode has been cancelled** because new commits were pushed to the branch. + +Previous retries were for commit `{{ previous_commit[:7] }}`, but the current commit is `{{ current_commit[:7] }}`. + +If you want to enable automatic retries for the new commits, please comment `@{{ robot }} babysit` again. +{% endblock %} + diff --git a/bert_e/templates/babysit_exhausted.md b/bert_e/templates/babysit_exhausted.md new file mode 100644 index 00000000..dd7eaed8 --- /dev/null +++ b/bert_e/templates/babysit_exhausted.md @@ -0,0 +1,23 @@ +{% extends "message.md" %} + +{% block title -%} +Babysit: Maximum retries reached +{% endblock %} + +{% block message %} +The {% if build_url -%}[build]({{ build_url }}) {% else -%}build {% endif -%} +has exhausted all automatic retry attempts on branch `{{ branch.name }}`. 
+
+**Exhausted workflows** ({{ max_retries }} retries each):
+{% for wf in exhausted_workflows -%}
+- `{{ wf }}`
+{% endfor %}
+To investigate:
+- Review the [build logs]({{ build_url }}) for the failure cause
+- Check if this is a flaky test or a genuine issue
+
+To get more retries:
+- Fix the issue, push new commits, and comment `@{{ robot }} babysit` again (pushing new commits cancels babysit), or
+- Comment `@{{ robot }} babysit` again to reset the retry counter
+{% endblock %}
+
diff --git a/bert_e/templates/babysit_retry.md b/bert_e/templates/babysit_retry.md
new file mode 100644
index 00000000..73eaaa82
--- /dev/null
+++ b/bert_e/templates/babysit_retry.md
@@ -0,0 +1,20 @@
+{% extends "message.md" %}
+
+{% block title -%}
+Babysit: Retrying build
+{% endblock %}
+
+{% block message %}
+The {% if build_url -%}[build]({{ build_url }}) {% else -%}build {% endif -%}
+failed on branch `{{ branch.name }}` (commit `{{ commit_sha[:7] }}`).
+
+**Babysit mode is active** - automatically retrying failed workflows:
+
+| Workflow | Retry |
+|:---------|:-----:|
+{% for wf in workflows -%}
+| `{{ wf.name }}` | {{ wf.retry_count }}/{{ max_retries }} |
+{% endfor %}
+Please wait for the new build to complete.
+{% endblock %}
+
diff --git a/bert_e/tests/unit/test_babysit.py b/bert_e/tests/unit/test_babysit.py
new file mode 100644
index 00000000..038e5af0
--- /dev/null
+++ b/bert_e/tests/unit/test_babysit.py
@@ -0,0 +1,785 @@
+# Copyright 2016-2018 Scality
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tests for the babysit feature.""" + +from types import SimpleNamespace +from unittest.mock import MagicMock, patch + +import pytest + +from bert_e.exceptions import BabysitRetry, BabysitExhausted, BabysitCancelled +from bert_e.git_host.github import AggregatedWorkflowRuns, Client +from bert_e.workflow.gitwaterflow.babysit import ( + count_babysit_retries_per_workflow, handle_babysit_retry, + BABYSIT_RETRY_MARKER, COMMIT_SHA_PATTERN, WORKFLOW_RETRY_PATTERN +) +from bert_e.workflow.gitwaterflow.queueing import ( + _check_pr_babysit_enabled, _handle_queue_babysit_retry +) + + +@pytest.fixture +def client(): + return Client( + login='login', + password='password', + email='email@org.com', + base_url="http://localhost:4010", + accept_header="application/json" + ) + + +@pytest.fixture +def failed_workflow_run_json(): + """Workflow run JSON with a failed run on an integration branch (w/).""" + return { + 'workflow_runs': [ + { + 'id': 12345, + 'head_sha': 'd6fde92930d4715a2b49857d24b940956b26d2d3', + 'head_branch': 'w/5.0/feature/test', + 'status': 'completed', + 'event': 'pull_request', + 'workflow_id': 1, + 'check_suite_id': 1, + 'conclusion': 'failure', + 'run_attempt': 2, + 'name': 'CI Build', + 'html_url': 'https://github.com/octo-org/Hello-World/actions/runs/12345', + 'repository': { + 'full_name': 'octo-org/Hello-World', + 'owner': {'login': 'octo-org'}, + 'name': 'Hello-World' + } + }, + ], + 'total_count': 1 + } + + +@pytest.fixture +def successful_workflow_run_json(): + """Workflow run JSON with a successful run.""" + return { + 'workflow_runs': [ + { + 'id': 12345, + 'head_sha': 'd6fde92930d4715a2b49857d24b940956b26d2d3', + 'head_branch': 'w/5.0/feature/test', + 'status': 'completed', + 'event': 'pull_request', + 'workflow_id': 1, + 'check_suite_id': 1, + 'conclusion': 'success', + 'run_attempt': 1, + 'name': 'CI Build', + 'repository': { + 'full_name': 'octo-org/Hello-World', + 'owner': {'login': 'octo-org'}, + 'name': 'Hello-World' + } + }, + ], + 'total_count': 1 + } + + +class TestGetFailedRuns: + """Tests for AggregatedWorkflowRuns.get_failed_runs().""" + + def test_get_failed_runs_returns_failed_workflows( + self, client, failed_workflow_run_json): + """Test that get_failed_runs returns failed workflow runs.""" + workflow_runs = AggregatedWorkflowRuns( + client, **failed_workflow_run_json) + + failed_runs = workflow_runs.get_failed_runs() + + assert len(failed_runs) == 1 + assert failed_runs[0]['id'] == 12345 + assert failed_runs[0]['run_attempt'] == 2 + assert failed_runs[0]['workflow_id'] == 1 + assert failed_runs[0]['name'] == 'CI Build' + + def test_get_failed_runs_returns_empty_for_successful( + self, client, successful_workflow_run_json): + """Test that get_failed_runs returns empty for successful runs.""" + workflow_runs = AggregatedWorkflowRuns( + client, **successful_workflow_run_json) + + failed_runs = workflow_runs.get_failed_runs() + + assert len(failed_runs) == 0 + + def test_get_failed_runs_default_run_attempt(self, client): + """Test that run_attempt defaults to 1 if not present.""" + workflow_run_json = { + 'workflow_runs': [ + { + 'id': 99999, + 'head_sha': 'abc123', + 'head_branch': 'feature', + 'status': 'completed', + 'event': 'pull_request', + 'workflow_id': 1, + 'check_suite_id': 1, + 'conclusion': 'failure', + # run_attempt not present + 'repository': { + 'full_name': 'org/repo', + 'owner': {'login': 'org'}, + 'name': 'repo' + } + }, + ], + 'total_count': 1 + } + workflow_runs = AggregatedWorkflowRuns(client, **workflow_run_json) + + failed_runs = 
workflow_runs.get_failed_runs() + + assert len(failed_runs) == 1 + assert failed_runs[0]['run_attempt'] == 1 # default + + +class TestPatterns: + """Tests for regex patterns.""" + + def test_commit_sha_pattern_matches_short_sha(self): + """Test matching a short commit SHA (7 chars).""" + text = 'branch `w/5.0/test` (commit `abc1234`)' + match = COMMIT_SHA_PATTERN.search(text) + assert match is not None + assert match.group(1) == 'abc1234' + + def test_commit_sha_pattern_matches_full_sha(self): + """Test matching a full commit SHA (40 chars).""" + full_sha = 'd6fde92930d4715a2b49857d24b940956b26d2d3' + text = f'(commit `{full_sha}`)' + match = COMMIT_SHA_PATTERN.search(text) + assert match is not None + assert match.group(1) == full_sha + + def test_commit_sha_pattern_no_match_without_backticks(self): + """Test that commit without backticks doesn't match.""" + text = '(commit abc1234)' + match = COMMIT_SHA_PATTERN.search(text) + assert match is None + + def test_workflow_retry_pattern_matches(self): + """Test that workflow retry pattern matches table rows.""" + text = '| `CI Build` | 2/5 |' + match = WORKFLOW_RETRY_PATTERN.search(text) + assert match is not None + assert match.group(1) == 'CI Build' + assert match.group(2) == '2' + assert match.group(3) == '5' + + def test_workflow_retry_pattern_matches_multiple(self): + """Test extracting multiple workflows from a table.""" + text = '''| Workflow | Retry | +|:---------|:-----:| +| `CI Build` | 1/5 | +| `Tests` | 3/5 | +| `Lint` | 2/5 | +''' + matches = list(WORKFLOW_RETRY_PATTERN.finditer(text)) + assert len(matches) == 3 + assert matches[0].group(1) == 'CI Build' + assert matches[1].group(1) == 'Tests' + assert matches[2].group(1) == 'Lint' + + +class TestCountBabysitRetriesPerWorkflow: + """Tests for count_babysit_retries_per_workflow function.""" + + def _make_comment(self, author, text): + """Create a mock comment.""" + comment = MagicMock() + comment.author = author + comment.text = text + return comment + + def _make_pr(self, comments): + """Create a mock PR with comments.""" + pr = MagicMock() + pr.comments = comments + return pr + + def _make_retry_comment(self, branch_name, commit, workflows): + """Create a BabysitRetry-like comment text.""" + lines = [ + BABYSIT_RETRY_MARKER, + f'failed on branch `{branch_name}` (commit `{commit[:7]}`)', + '| Workflow | Retry |', + '|:---------|:-----:|', + ] + for wf_name, retry_count, max_retries in workflows: + lines.append(f'| `{wf_name}` | {retry_count}/{max_retries} |') + return '\n'.join(lines) + + def test_no_comments_returns_empty(self): + """Test counting with no comments returns empty dict.""" + pr = self._make_pr([]) + retries, is_stale, prev = count_babysit_retries_per_workflow( + pr, 'bert-e', 'w/5.0/feature/test', 'abc1234567890') + assert retries == {} + assert is_stale is False + assert prev is None + + def test_counts_retries_per_workflow(self): + """Test counting retries per workflow from comments.""" + branch_name = 'w/5.0/feature/test' + commit = 'abc1234567890' + comments = [ + self._make_comment('user', '@bert-e babysit'), + self._make_comment('bert-e', self._make_retry_comment( + branch_name, commit, + [('CI Build', 1, 5), ('Tests', 1, 5)] + )), + self._make_comment('bert-e', self._make_retry_comment( + branch_name, commit, + [('CI Build', 2, 5)] # Only CI failed this time + )), + ] + pr = self._make_pr(comments) + retries, is_stale, prev = count_babysit_retries_per_workflow( + pr, 'bert-e', branch_name, commit) + + # CI Build was retried twice, Tests once + assert retries 
== {'CI Build': 2, 'Tests': 1} + assert is_stale is False + + def test_babysit_command_resets_all_counts(self): + """Test that a new /babysit command resets all workflow counts.""" + branch_name = 'w/5.0/feature/test' + commit = 'abc1234567890' + comments = [ + self._make_comment('user', '@bert-e babysit'), + self._make_comment('bert-e', self._make_retry_comment( + branch_name, commit, + [('CI Build', 1, 5), ('Tests', 1, 5)] + )), + self._make_comment('bert-e', self._make_retry_comment( + branch_name, commit, + [('CI Build', 2, 5), ('Tests', 2, 5)] + )), + # User re-invokes babysit + self._make_comment('user', '@bert-e babysit'), + self._make_comment('bert-e', self._make_retry_comment( + branch_name, commit, + [('CI Build', 1, 5)] + )), + ] + pr = self._make_pr(comments) + retries, is_stale, prev = count_babysit_retries_per_workflow( + pr, 'bert-e', branch_name, commit) + + # Only 1 retry for CI Build since the reset + assert retries == {'CI Build': 1} + assert is_stale is False + + def test_detects_stale_babysit(self): + """Test detection of stale babysit when commit changed.""" + branch_name = 'w/5.0/feature/test' + old_commit = 'abc1234567890' + new_commit = 'def9876543210' + comments = [ + self._make_comment('user', '@bert-e babysit'), + self._make_comment('bert-e', self._make_retry_comment( + branch_name, old_commit, + [('CI Build', 1, 5)] + )), + ] + pr = self._make_pr(comments) + retries, is_stale, prev = count_babysit_retries_per_workflow( + pr, 'bert-e', branch_name, new_commit) + + assert retries == {'CI Build': 1} + assert is_stale is True + assert prev == old_commit[:7] + + def test_new_babysit_clears_stale(self): + """Test that re-invoking /babysit clears stale flag.""" + branch_name = 'w/5.0/feature/test' + old_commit = 'abc1234567890' + new_commit = 'def9876543210' + comments = [ + self._make_comment('user', '@bert-e babysit'), + self._make_comment('bert-e', self._make_retry_comment( + branch_name, old_commit, + [('CI Build', 1, 5)] + )), + # User pushes new commit and re-invokes babysit + self._make_comment('user', '@bert-e babysit'), + ] + pr = self._make_pr(comments) + retries, is_stale, prev = count_babysit_retries_per_workflow( + pr, 'bert-e', branch_name, new_commit) + + assert retries == {} + assert is_stale is False + assert prev is None + + +class TestBabysitExceptions: + """Tests for BabysitRetry, BabysitExhausted, and BabysitCancelled.""" + + def test_babysit_retry_exception(self): + """Test BabysitRetry exception creation.""" + branch = SimpleNamespace(name='w/5.0/feature/test') + exc = BabysitRetry( + active_options=['babysit'], + branch=branch, + build_url='https://github.com/org/repo/actions/runs/123', + commit_sha='abc1234567890', + workflows=[ + {'id': 1, 'name': 'CI Build', 'retry_count': 2}, + {'id': 2, 'name': 'Tests', 'retry_count': 1}, + ], + max_retries=5, + ) + + assert exc.code == 140 + assert exc.status == "in_progress" + + def test_babysit_exhausted_exception(self): + """Test BabysitExhausted exception creation.""" + branch = SimpleNamespace(name='w/5.0/feature/test') + exc = BabysitExhausted( + active_options=['babysit'], + branch=branch, + build_url='https://github.com/org/repo/actions/runs/123', + max_retries=5, + robot='bert-e', + exhausted_workflows=['CI Build', 'Tests'], + ) + + assert exc.code == 141 + assert exc.status == "failure" + + def test_babysit_cancelled_exception(self): + """Test BabysitCancelled exception creation.""" + branch = SimpleNamespace(name='w/5.0/feature/test') + exc = BabysitCancelled( + active_options=['babysit'], + 
branch=branch, + previous_commit='abc1234567890', + current_commit='def9876543210', + robot='bert-e', + ) + + assert exc.code == 142 + assert exc.status == "in_progress" + + +class TestHandleBabysitRetry: + """Tests for handle_babysit_retry function.""" + + def _make_comment(self, author, text): + """Create a mock comment.""" + comment = MagicMock() + comment.author = author + comment.text = text + return comment + + def _make_retry_comment(self, branch_name, commit, workflows): + """Create a BabysitRetry-like comment text.""" + lines = [ + BABYSIT_RETRY_MARKER, + f'failed on branch `{branch_name}` (commit `{commit[:7]}`)', + '| Workflow | Retry |', + '|:---------|:-----:|', + ] + for wf_name, retry_count, max_retries in workflows: + lines.append(f'| `{wf_name}` | {retry_count}/{max_retries} |') + return '\n'.join(lines) + + def _make_job(self, babysit=False, host='github', build_key='github_actions', + max_retries=5, comments=None): + """Create a mock job with settings.""" + settings = SimpleNamespace( + babysit=babysit, + repository_host=host, + repository_owner='octo-org', + repository_slug='Hello-World', + max_babysit_retries=max_retries, + robot='bert-e', + ) + project_repo = MagicMock() + project_repo.get_build_url.return_value = 'https://example.com/build' + project_repo.rerun_failed_workflow_jobs = MagicMock() + + pull_request = MagicMock() + pull_request.comments = comments or [] + + job = SimpleNamespace( + settings=settings, + project_repo=project_repo, + active_options=['babysit'] if babysit else [], + pull_request=pull_request, + ) + return job + + def _make_branch(self, commit='abc1234567890', name='w/5.0/feature/test'): + """Create a mock branch.""" + branch = MagicMock() + branch.name = name + branch.get_latest_commit.return_value = commit + return branch + + def test_babysit_disabled_returns_false(self): + """Test that babysit logic is skipped when disabled.""" + job = self._make_job(babysit=False) + branch = self._make_branch() + + result = handle_babysit_retry(job, branch, 'github_actions') + + assert result is False + + def test_babysit_skips_non_github(self): + """Test that babysit is skipped for non-GitHub hosts.""" + job = self._make_job(babysit=True, host='bitbucket') + branch = self._make_branch() + + result = handle_babysit_retry(job, branch, 'github_actions') + + assert result is False + + def test_babysit_retry_per_workflow(self, client): + """Test that babysit tracks retries per workflow.""" + branch_name = 'w/5.0/feature/test' + commit = 'abc1234567890' + + # CI Build has 2 retries, Tests has 0 + comments = [ + self._make_comment('user', '@bert-e babysit'), + self._make_comment('bert-e', self._make_retry_comment( + branch_name, commit, + [('CI Build', 1, 5)] + )), + self._make_comment('bert-e', self._make_retry_comment( + branch_name, commit, + [('CI Build', 2, 5)] + )), + ] + job = self._make_job(babysit=True, max_retries=5, comments=comments) + branch = self._make_branch(commit=commit, name=branch_name) + + # Both workflows fail + workflow_run_json = { + 'workflow_runs': [ + { + 'id': 11111, + 'head_sha': commit, + 'head_branch': branch_name, + 'status': 'completed', + 'event': 'pull_request', + 'workflow_id': 1, + 'check_suite_id': 1, + 'conclusion': 'failure', + 'run_attempt': 3, + 'name': 'CI Build', + 'repository': { + 'full_name': 'octo-org/Hello-World', + 'owner': {'login': 'octo-org'}, + 'name': 'Hello-World' + } + }, + { + 'id': 22222, + 'head_sha': commit, + 'head_branch': branch_name, + 'status': 'completed', + 'event': 'pull_request', + 
'workflow_id': 2, + 'check_suite_id': 2, + 'conclusion': 'failure', + 'run_attempt': 1, + 'name': 'Tests', + 'repository': { + 'full_name': 'octo-org/Hello-World', + 'owner': {'login': 'octo-org'}, + 'name': 'Hello-World' + } + }, + ], + 'total_count': 2 + } + workflow_runs = AggregatedWorkflowRuns(client, **workflow_run_json) + + with patch.object(AggregatedWorkflowRuns, 'get', + return_value=workflow_runs): + with pytest.raises(BabysitRetry) as exc_info: + handle_babysit_retry(job, branch, 'github_actions') + + # Both workflows should be retried + assert job.project_repo.rerun_failed_workflow_jobs.call_count == 2 + + # Check the workflows in the exception + workflows = exc_info.value.kwargs['workflows'] + wf_names = {wf['name'] for wf in workflows} + assert 'CI Build' in wf_names + assert 'Tests' in wf_names + + # CI Build should be at retry 3, Tests at retry 1 + ci_wf = next(wf for wf in workflows if wf['name'] == 'CI Build') + tests_wf = next(wf for wf in workflows if wf['name'] == 'Tests') + assert ci_wf['retry_count'] == 3 + assert tests_wf['retry_count'] == 1 + + def test_workflow_exhausted_individually(self, client): + """Test that workflows are exhausted individually.""" + branch_name = 'w/5.0/feature/test' + commit = 'abc1234567890' + + # CI Build has 5 retries (exhausted), Tests has 2 + comments = [ + self._make_comment('user', '@bert-e babysit'), + ] + for i in range(5): + comments.append(self._make_comment('bert-e', self._make_retry_comment( + branch_name, commit, + [('CI Build', i + 1, 5)] + ))) + for i in range(2): + comments.append(self._make_comment('bert-e', self._make_retry_comment( + branch_name, commit, + [('Tests', i + 1, 5)] + ))) + + job = self._make_job(babysit=True, max_retries=5, comments=comments) + branch = self._make_branch(commit=commit, name=branch_name) + + # Both workflows fail + workflow_run_json = { + 'workflow_runs': [ + { + 'id': 11111, + 'head_sha': commit, + 'head_branch': branch_name, + 'status': 'completed', + 'event': 'pull_request', + 'workflow_id': 1, + 'check_suite_id': 1, + 'conclusion': 'failure', + 'run_attempt': 6, + 'name': 'CI Build', + 'repository': { + 'full_name': 'octo-org/Hello-World', + 'owner': {'login': 'octo-org'}, + 'name': 'Hello-World' + } + }, + { + 'id': 22222, + 'head_sha': commit, + 'head_branch': branch_name, + 'status': 'completed', + 'event': 'pull_request', + 'workflow_id': 2, + 'check_suite_id': 2, + 'conclusion': 'failure', + 'run_attempt': 3, + 'name': 'Tests', + 'repository': { + 'full_name': 'octo-org/Hello-World', + 'owner': {'login': 'octo-org'}, + 'name': 'Hello-World' + } + }, + ], + 'total_count': 2 + } + workflow_runs = AggregatedWorkflowRuns(client, **workflow_run_json) + + with patch.object(AggregatedWorkflowRuns, 'get', + return_value=workflow_runs): + with pytest.raises(BabysitRetry) as exc_info: + handle_babysit_retry(job, branch, 'github_actions') + + # Only Tests should be retried (CI Build is exhausted) + job.project_repo.rerun_failed_workflow_jobs.assert_called_once_with(22222) + + # Check that only Tests is in the retry list + workflows = exc_info.value.kwargs['workflows'] + assert len(workflows) == 1 + assert workflows[0]['name'] == 'Tests' + assert workflows[0]['retry_count'] == 3 + + def test_all_workflows_exhausted(self, client): + """Test that BabysitExhausted is raised when all workflows exhausted.""" + branch_name = 'w/5.0/feature/test' + commit = 'abc1234567890' + + # Both workflows have 5 retries (exhausted) + comments = [ + self._make_comment('user', '@bert-e babysit'), + ] + for i in 
range(5): + comments.append(self._make_comment('bert-e', self._make_retry_comment( + branch_name, commit, + [('CI Build', i + 1, 5), ('Tests', i + 1, 5)] + ))) + + job = self._make_job(babysit=True, max_retries=5, comments=comments) + branch = self._make_branch(commit=commit, name=branch_name) + + # Both workflows fail + workflow_run_json = { + 'workflow_runs': [ + { + 'id': 11111, + 'head_sha': commit, + 'head_branch': branch_name, + 'status': 'completed', + 'event': 'pull_request', + 'workflow_id': 1, + 'check_suite_id': 1, + 'conclusion': 'failure', + 'run_attempt': 6, + 'name': 'CI Build', + 'repository': { + 'full_name': 'octo-org/Hello-World', + 'owner': {'login': 'octo-org'}, + 'name': 'Hello-World' + } + }, + { + 'id': 22222, + 'head_sha': commit, + 'head_branch': branch_name, + 'status': 'completed', + 'event': 'pull_request', + 'workflow_id': 2, + 'check_suite_id': 2, + 'conclusion': 'failure', + 'run_attempt': 6, + 'name': 'Tests', + 'repository': { + 'full_name': 'octo-org/Hello-World', + 'owner': {'login': 'octo-org'}, + 'name': 'Hello-World' + } + }, + ], + 'total_count': 2 + } + workflow_runs = AggregatedWorkflowRuns(client, **workflow_run_json) + + with patch.object(AggregatedWorkflowRuns, 'get', + return_value=workflow_runs): + with pytest.raises(BabysitExhausted) as exc_info: + handle_babysit_retry(job, branch, 'github_actions') + + # No reruns should be called + job.project_repo.rerun_failed_workflow_jobs.assert_not_called() + + # Check exhausted workflows + exhausted = exc_info.value.kwargs['exhausted_workflows'] + assert 'CI Build' in exhausted + assert 'Tests' in exhausted + + def test_babysit_cancelled_on_new_commit(self, client): + """Test that babysit is cancelled when new commits are pushed.""" + branch_name = 'w/5.0/feature/test' + old_commit = 'abc1234567890' + new_commit = 'def9876543210' + + comments = [ + self._make_comment('user', '@bert-e babysit'), + self._make_comment('bert-e', self._make_retry_comment( + branch_name, old_commit, + [('CI Build', 1, 5)] + )), + ] + + job = self._make_job(babysit=True, max_retries=5, comments=comments) + branch = self._make_branch(commit=new_commit, name=branch_name) + + with pytest.raises(BabysitCancelled) as exc_info: + handle_babysit_retry(job, branch, 'github_actions') + + assert exc_info.value.kwargs['previous_commit'] == old_commit[:7] + assert exc_info.value.kwargs['current_commit'] == new_commit + + +class TestCheckPrBabysitEnabled: + """Tests for _check_pr_babysit_enabled function.""" + + def test_babysit_enabled_in_comments(self): + """Test detecting babysit option from PR comments.""" + comment = MagicMock() + comment.author = 'user' + comment.text = '@bert-e babysit' + + pull_request = MagicMock() + pull_request.comments = [comment] + pull_request.author = 'user' + + settings = SimpleNamespace( + robot='bert-e', + admins=[], + ) + + result = _check_pr_babysit_enabled(pull_request, settings) + assert result is True + + def test_babysit_not_enabled(self): + """Test when babysit is not in PR comments.""" + comment = MagicMock() + comment.author = 'user' + comment.text = '@bert-e approve' + + pull_request = MagicMock() + pull_request.comments = [comment] + pull_request.author = 'user' + + settings = SimpleNamespace( + robot='bert-e', + admins=[], + ) + + result = _check_pr_babysit_enabled(pull_request, settings) + assert result is False + + +class TestQueueBabysitRetry: + """Tests for queue babysit retry functionality.""" + + def test_queue_babysit_skips_non_github(self): + """Test queue babysit is skipped for 
non-GitHub hosts.""" + settings = SimpleNamespace( + repository_host='bitbucket', + max_babysit_retries=5, + ) + job = SimpleNamespace(settings=settings) + queues = MagicMock() + queues.build_key = 'github_actions' + + result = _handle_queue_babysit_retry(job, queues, [123]) + assert result is False + + def test_queue_babysit_skips_non_github_actions(self): + """Test queue babysit is skipped for non-github_actions build key.""" + settings = SimpleNamespace( + repository_host='github', + max_babysit_retries=5, + ) + job = SimpleNamespace(settings=settings) + queues = MagicMock() + queues.build_key = 'pre-merge' + + result = _handle_queue_babysit_retry(job, queues, [123]) + assert result is False diff --git a/bert_e/workflow/gitwaterflow/__init__.py b/bert_e/workflow/gitwaterflow/__init__.py index 7bda9d6c..3e02636c 100644 --- a/bert_e/workflow/gitwaterflow/__init__.py +++ b/bert_e/workflow/gitwaterflow/__init__.py @@ -628,6 +628,8 @@ def check_build_status(job, wbranches): BuildFailed: if a build failed or was stopped. BuildNotStarted: if a build hasn't started yet. BuildInProgress: if a build is still in progress. + BabysitRetry: if babysit mode is active and retrying failed jobs. + BabysitExhausted: if babysit mode exhausted all retries. """ @@ -651,6 +653,10 @@ def status(branch): worst = max(wbranches, key=lambda b: ordered_state[statuses[b.name]]) worst_status = statuses[worst.name] if worst_status in ('FAILED', 'STOPPED'): + # Check if babysit mode should handle the failure + if _handle_babysit_retry(job, worst, key): + return # Babysit handled the failure (raised an exception) + raise messages.BuildFailed( active_options=job.active_options, branch=worst, @@ -667,3 +673,7 @@ def status(branch): elif worst_status == 'INPROGRESS': raise messages.BuildInProgress() assert worst_status == 'SUCCESSFUL' + + +# Import the shared babysit module +from .babysit import handle_babysit_retry as _handle_babysit_retry diff --git a/bert_e/workflow/gitwaterflow/babysit.py b/bert_e/workflow/gitwaterflow/babysit.py new file mode 100644 index 00000000..887e6ced --- /dev/null +++ b/bert_e/workflow/gitwaterflow/babysit.py @@ -0,0 +1,290 @@ +# Copyright 2016-2026 Scality +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Babysit feature - automatic retry of failed GitHub Actions builds. + +This module provides the babysit functionality that automatically retries +failed GitHub Actions builds when the /babysit option is enabled on a +pull request. The feature: + +1. Monitors build failures on any branch (integration branches w/*, queue + branches, etc.) +2. Automatically triggers GitHub's "Re-run failed jobs" for failed workflow + runs +3. Tracks retry count PER WORKFLOW by parsing Bert-E's BabysitRetry comments + since the last /babysit command from the user +4. After max_babysit_retries (configurable, default 5) for a workflow, that + workflow is considered exhausted. When ALL failed workflows are exhausted, + posts a BabysitExhausted notification +5. 
Users can comment /babysit again to reset all retry counters and get + additional retries +6. If new commits are pushed, babysit is cancelled and must be re-invoked + +""" +import logging +import re +from collections import defaultdict + +from bert_e import exceptions as messages + + +LOG = logging.getLogger(__name__) + +# Marker text used to identify BabysitRetry comments from Bert-E +BABYSIT_RETRY_MARKER = "Babysit: Retrying build" + +# Regex to extract commit SHA from BabysitRetry comments +# Matches: (commit `abc1234`) where abc1234 is 7+ hex chars +COMMIT_SHA_PATTERN = re.compile(r'\(commit `([a-f0-9]{7,40})`\)') + +# Regex to extract workflow names from the retry table +# Matches: | `workflow_name` | X/Y | +WORKFLOW_RETRY_PATTERN = re.compile(r'\| `([^`]+)` \| (\d+)/(\d+) \|') + + +def count_babysit_retries_per_workflow(pull_request, robot_name, branch_name, + current_commit): + """Count how many babysit retries have been done for each workflow. + + This parses BabysitRetry comments posted by Bert-E for the specific branch + since the last /babysit command, and extracts retry counts per workflow. + + Also detects if babysit is "stale" - i.e., retries were done for a + different commit than the current one. + + Args: + pull_request: The pull request to check comments on. + robot_name: The robot's username (e.g., "bert-e"). + branch_name: The branch name to count retries for. + current_commit: The current commit SHA on the branch. + + Returns: + tuple: (workflow_retries, is_stale, previous_commit) + - workflow_retries: dict mapping workflow name to retry count + - is_stale: True if retries were for a different commit. + - previous_commit: The commit SHA from previous retries (if stale), + or None. + + """ + workflow_retries = defaultdict(int) + previous_commit = None + is_stale = False + + # Pattern to find /babysit command from users + babysit_cmd_pattern = re.compile( + r'@' + re.escape(robot_name) + r'\s+babysit\b', + re.IGNORECASE + ) + + for comment in pull_request.comments: + author = comment.author + text = comment.text + + if author == robot_name: + # Check if this is a BabysitRetry comment for our branch + if (BABYSIT_RETRY_MARKER in text and + f"`{branch_name}`" in text): + + # Extract commit SHA from the comment + sha_match = COMMIT_SHA_PATTERN.search(text) + if sha_match: + comment_commit = sha_match.group(1) + # Check if this retry was for a different commit + if not current_commit.startswith(comment_commit): + is_stale = True + previous_commit = comment_commit + + # Extract workflow retry counts from the table + for wf_match in WORKFLOW_RETRY_PATTERN.finditer(text): + workflow_name = wf_match.group(1) + # The retry count in the message is the current retry number + # We just need to track that this workflow was retried + workflow_retries[workflow_name] += 1 + else: + # Check if user sent /babysit command - this resets all counts + if babysit_cmd_pattern.search(text): + workflow_retries = defaultdict(int) + is_stale = False + previous_commit = None + + return dict(workflow_retries), is_stale, previous_commit + + +def handle_babysit_retry(job, failed_branch, build_key, pull_request=None): + """Handle babysit retry logic for failed builds. + + This function is called when a build fails and the babysit option is + enabled. It will automatically retry the failed GitHub Actions jobs + up to max_babysit_retries times PER WORKFLOW. + + The retry count is tracked per workflow by parsing Bert-E's BabysitRetry + comments since the last /babysit command. 
This allows users to re-invoke + /babysit to get additional retries after exhaustion. + + If new commits are pushed after babysit was invoked, babysit is + cancelled and the user must re-invoke it. + + Args: + job: The current job. + failed_branch: The branch with the failed build (integration branch, + queue branch, or any branch). + build_key: The build key being checked (must be 'github_actions'). + pull_request: Optional pull request to check for comments. If not + provided, uses job.pull_request. + + Returns: + True if babysit handled the failure (always raises an exception). + False if babysit is not applicable. + + Raises: + BabysitRetry: if retrying the failed jobs. + BabysitExhausted: if max retries reached for all failed workflows. + BabysitCancelled: if new commits were pushed since babysit was invoked. + + """ + # Use the provided pull_request or fall back to job's pull_request + pr = pull_request or getattr(job, 'pull_request', None) + if pr is None: + LOG.debug("Babysit: no pull request available") + return False + + # Check if babysit is enabled + if not job.settings.babysit: + return False + + # Babysit only works for GitHub with github_actions build key + if job.settings.repository_host != 'github': + LOG.debug("Babysit: skipping, not GitHub host") + return False + + if build_key != 'github_actions': + LOG.debug("Babysit: skipping, build_key is not github_actions") + return False + + branch_name = failed_branch.name + commit_sha = failed_branch.get_latest_commit() + max_retries = job.settings.max_babysit_retries + + LOG.info("Babysit: checking failed build on branch %s (commit %s)", + branch_name, commit_sha[:7]) + + # Count existing retries per workflow and check for stale babysit + workflow_retries, is_stale, previous_commit = \ + count_babysit_retries_per_workflow( + pr, job.settings.robot, branch_name, commit_sha + ) + + LOG.info("Babysit: branch=%s, workflow_retries=%s, is_stale=%s", + branch_name, workflow_retries, is_stale) + + build_url = job.project_repo.get_build_url(commit_sha, build_key) + + # Check if babysit is stale (new commits pushed since babysit was invoked) + if is_stale and previous_commit: + LOG.info("Babysit: cancelled for %s due to new commits " + "(was: %s, now: %s)", + branch_name, previous_commit, commit_sha[:7]) + raise messages.BabysitCancelled( + active_options=job.active_options, + branch=failed_branch, + previous_commit=previous_commit, + current_commit=commit_sha, + robot=job.settings.robot, + ) + + # Get the workflow runs for the failed commit + from bert_e.git_host.github import AggregatedWorkflowRuns + + try: + workflow_runs = AggregatedWorkflowRuns.get( + client=job.project_repo.client, + owner=job.settings.repository_owner, + repo=job.settings.repository_slug, + params={'head_sha': commit_sha} + ) + except Exception as err: + LOG.warning("Babysit: failed to get workflow runs for %s: %s", + branch_name, err) + return False + + # Get failed runs + failed_runs = workflow_runs.get_failed_runs() + if not failed_runs: + LOG.debug("Babysit: no failed workflow runs found for %s", branch_name) + return False + + # Categorize workflows: which can be retried, which are exhausted + workflows_to_retry = [] + exhausted_workflows = [] + + for run in failed_runs: + workflow_name = run.get('name', f"workflow_{run['id']}") + current_count = workflow_retries.get(workflow_name, 0) + + if current_count >= max_retries: + LOG.info("Babysit: workflow '%s' exhausted (%d/%d)", + workflow_name, current_count, max_retries) + 
exhausted_workflows.append(workflow_name) + else: + workflows_to_retry.append({ + 'id': run['id'], + 'name': workflow_name, + 'retry_count': current_count + 1, # This will be the new count + }) + + # If all failed workflows are exhausted, raise BabysitExhausted + if not workflows_to_retry and exhausted_workflows: + LOG.info("Babysit: all workflows exhausted for %s: %s", + branch_name, exhausted_workflows) + raise messages.BabysitExhausted( + active_options=job.active_options, + branch=failed_branch, + build_url=build_url, + max_retries=max_retries, + robot=job.settings.robot, + exhausted_workflows=exhausted_workflows, + ) + + # If no workflows to retry (but also none exhausted), something is off + if not workflows_to_retry: + LOG.warning("Babysit: no workflows to retry and none exhausted for %s", + branch_name) + return False + + # Trigger re-run of each workflow that hasn't exhausted retries + rerun_triggered = False + for wf in workflows_to_retry: + try: + LOG.info("Babysit: re-running failed jobs for workflow '%s' " + "(id=%d) on %s, retry %d/%d", + wf['name'], wf['id'], branch_name, + wf['retry_count'], max_retries) + job.project_repo.rerun_failed_workflow_jobs(wf['id']) + rerun_triggered = True + except Exception as err: + LOG.warning("Babysit: failed to rerun workflow %d (%s) on %s: %s", + wf['id'], wf['name'], branch_name, err) + + if not rerun_triggered: + LOG.warning("Babysit: could not trigger any reruns for %s", branch_name) + return False + + # Raise BabysitRetry with per-workflow information + raise messages.BabysitRetry( + active_options=job.active_options, + branch=failed_branch, + build_url=build_url, + commit_sha=commit_sha, + workflows=workflows_to_retry, + max_retries=max_retries, + ) diff --git a/bert_e/workflow/gitwaterflow/commands.py b/bert_e/workflow/gitwaterflow/commands.py index eb5b5bc2..7526b5b3 100644 --- a/bert_e/workflow/gitwaterflow/commands.py +++ b/bert_e/workflow/gitwaterflow/commands.py @@ -228,3 +228,8 @@ def setup(defaults={}): "wait", "Instruct Bert-E not to run until further notice.", default=defaults.get("wait", False)) + Reactor.add_option( + "babysit", + "Automatically retry failed GitHub Actions builds.", + privileged=False, + default=defaults.get("babysit", False)) diff --git a/bert_e/workflow/gitwaterflow/queueing.py b/bert_e/workflow/gitwaterflow/queueing.py index ccc4afab..22b5b5bb 100644 --- a/bert_e/workflow/gitwaterflow/queueing.py +++ b/bert_e/workflow/gitwaterflow/queueing.py @@ -15,11 +15,13 @@ import logging from copy import deepcopy +from types import SimpleNamespace from bert_e import exceptions from bert_e.job import handler as job_handler from bert_e.job import QueuesJob, PullRequestJob from bert_e.lib import git +from bert_e.reactor import Reactor from ..git_utils import clone_git_repo, consecutive_merge, robust_merge, push from ..pr_utils import notify_user @@ -28,12 +30,140 @@ QueueIntegrationBranch, branch_factory, build_queue_collection) from .integration import get_integration_branches -from typing import List +from typing import List, Dict, Any LOG = logging.getLogger(__name__) +def _check_pr_babysit_enabled(pull_request, settings) -> bool: + """Check if the babysit option is enabled for a pull request. + + Args: + pull_request: The pull request to check. + settings: The bot settings. + + Returns: + True if babysit is enabled, False otherwise. 
+ """ + # Create a temporary job-like object to parse options + temp_job = SimpleNamespace(settings={}) + reactor = Reactor() + reactor.init_settings(temp_job) + + prefix = '@{}'.format(settings.robot) + admins = settings.admins + + # Parse options from comments (ignore errors, just check for babysit) + for comment in pull_request.comments: + author = comment.author + privileged = author in admins + authored = author == pull_request.author + text = comment.text + try: + reactor.handle_options(temp_job, text, prefix, privileged, authored) + except Exception: + # Ignore errors, we just want to check for babysit + pass + + return temp_job.settings.get('babysit', False) + + +def _handle_queue_babysit_retry(job: QueuesJob, queues: QueueCollection, + failed_prs: List[int]) -> bool: + """Handle babysit retry logic for failed queue builds. + + For each failed PR in the queue that has babysit enabled, this function + will use the shared babysit logic to retry the failed GitHub Actions jobs + on the queue branch. + + The retry counting is done by counting Bert-E's BabysitRetry comments + for each specific branch since the last /babysit command from the user. + This allows users to comment /babysit again to get additional retries. + + Args: + job: The queue job. + queues: The queue collection with build status info. + failed_prs: List of PR IDs with failed builds. + + Returns: + True if any retries were triggered. + False if no babysit retries were applicable. + """ + from .babysit import handle_babysit_retry + from ..pr_utils import notify_user + + # Babysit only works for GitHub with github_actions build key + if job.settings.repository_host != 'github': + LOG.debug("Queue babysit: skipping, not GitHub host") + return False + + if queues.build_key != 'github_actions': + LOG.debug("Queue babysit: skipping, build_key is not github_actions") + return False + + retried_any = False + + for pr_id in failed_prs: + try: + pull_request = job.project_repo.get_pull_request(pr_id) + except Exception as err: + LOG.warning("Queue babysit: failed to get PR %d: %s", pr_id, err) + continue + + # Check if this PR has babysit enabled + if not _check_pr_babysit_enabled(pull_request, job.settings): + LOG.debug("Queue babysit: PR %d does not have babysit enabled", + pr_id) + continue + + # Find the queue integration branches for this PR + for version in queues._queues.keys(): + qints = queues._queues[version][QueueIntegrationBranch] + for qint in qints: + if qint.pr_id != pr_id: + continue + + # Check build status on this queue branch + commit_sha = qint.get_latest_commit() + status = queues.bbrepo.get_build_status(commit_sha, + queues.build_key) + if status != 'FAILED': + continue + + LOG.info("Queue babysit: checking failed build on %s for PR %d", + qint.name, pr_id) + + # Create a temporary job-like object for the shared babysit logic + temp_job = SimpleNamespace( + settings=job.settings, + project_repo=job.project_repo, + active_options=['babysit'], + pull_request=pull_request, + ) + + try: + # Use the shared babysit logic + handle_babysit_retry( + temp_job, qint, queues.build_key, + pull_request=pull_request + ) + except exceptions.BabysitRetry as retry_exc: + # Notify the PR about the retry + notify_user(job.settings, pull_request, retry_exc) + retried_any = True + except exceptions.BabysitExhausted as exhausted_exc: + # Notify the PR about exhaustion + notify_user(job.settings, pull_request, exhausted_exc) + retried_any = True # We handled it, just not with a retry + except exceptions.BabysitCancelled as 
cancelled_exc: + # Notify the PR that babysit was cancelled due to new commits + notify_user(job.settings, pull_request, cancelled_exc) + # Don't set retried_any - let normal failure handling proceed + + return retried_any + + def notify_queue_build_failed(failed_prs: List[int], job: QueuesJob): """Notify on the pull request that the queue build failed.""" # TODO: As this feature evolves, we might want to include @@ -73,6 +203,11 @@ def handle_merge_queues(job): if not failed_prs: raise exceptions.NothingToDo() else: + # Check if babysit should handle the failed queue builds + if _handle_queue_babysit_retry(job, queues, failed_prs): + # Babysit triggered retries, raise BuildInProgress to wait + raise exceptions.BuildInProgress() + notify_queue_build_failed(failed_prs, job) raise exceptions.QueueBuildFailed() From 608657bdc9a421486113a7d6b661d25f5aa5dcbd Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Watenberg Date: Mon, 12 Jan 2026 10:36:12 +0100 Subject: [PATCH 2/2] Fix linting and tests --- bert_e/git_host/github/__init__.py | 3 +- bert_e/tests/unit/test_babysit.py | 52 +++++++++++++++--------- bert_e/workflow/gitwaterflow/__init__.py | 3 +- bert_e/workflow/gitwaterflow/babysit.py | 11 ++--- bert_e/workflow/gitwaterflow/queueing.py | 22 ++++++---- 5 files changed, 55 insertions(+), 36 deletions(-) diff --git a/bert_e/git_host/github/__init__.py b/bert_e/git_host/github/__init__.py index 767cb5b8..df506627 100644 --- a/bert_e/git_host/github/__init__.py +++ b/bert_e/git_host/github/__init__.py @@ -525,7 +525,8 @@ def rerun_failed_workflow_jobs(self, run_id: int) -> None: run_id: The ID of the workflow run to re-run failed jobs for. """ - url = f'/repos/{self.owner}/{self.slug}/actions/runs/{run_id}/rerun-failed-jobs' + url = (f'/repos/{self.owner}/{self.slug}/actions/runs/' + f'{run_id}/rerun-failed-jobs') self.client.post(url, data='{}') diff --git a/bert_e/tests/unit/test_babysit.py b/bert_e/tests/unit/test_babysit.py index 038e5af0..9b2a91b2 100644 --- a/bert_e/tests/unit/test_babysit.py +++ b/bert_e/tests/unit/test_babysit.py @@ -27,6 +27,11 @@ from bert_e.workflow.gitwaterflow.queueing import ( _check_pr_babysit_enabled, _handle_queue_babysit_retry ) +# Import setup to register reactor options +from bert_e.workflow.gitwaterflow.commands import setup as gwf_setup + +# Call setup to register all options including babysit +gwf_setup() @pytest.fixture @@ -42,7 +47,7 @@ def client(): @pytest.fixture def failed_workflow_run_json(): - """Workflow run JSON with a failed run on an integration branch (w/).""" + """Workflow run JSON with failed run on an integration branch.""" return { 'workflow_runs': [ { @@ -56,7 +61,7 @@ def failed_workflow_run_json(): 'conclusion': 'failure', 'run_attempt': 2, 'name': 'CI Build', - 'html_url': 'https://github.com/octo-org/Hello-World/actions/runs/12345', + 'html_url': 'https://github.com/org/repo/actions/runs/12345', 'repository': { 'full_name': 'octo-org/Hello-World', 'owner': {'login': 'octo-org'}, @@ -406,8 +411,8 @@ def _make_retry_comment(self, branch_name, commit, workflows): lines.append(f'| `{wf_name}` | {retry_count}/{max_retries} |') return '\n'.join(lines) - def _make_job(self, babysit=False, host='github', build_key='github_actions', - max_retries=5, comments=None): + def _make_job(self, babysit=False, host='github', + build_key='github_actions', max_retries=5, comments=None): """Create a mock job with settings.""" settings = SimpleNamespace( babysit=babysit, @@ -549,15 +554,19 @@ def test_workflow_exhausted_individually(self, client): 
self._make_comment('user', '@bert-e babysit'), ] for i in range(5): - comments.append(self._make_comment('bert-e', self._make_retry_comment( - branch_name, commit, - [('CI Build', i + 1, 5)] - ))) + comments.append(self._make_comment( + 'bert-e', + self._make_retry_comment( + branch_name, commit, [('CI Build', i + 1, 5)] + ) + )) for i in range(2): - comments.append(self._make_comment('bert-e', self._make_retry_comment( - branch_name, commit, - [('Tests', i + 1, 5)] - ))) + comments.append(self._make_comment( + 'bert-e', + self._make_retry_comment( + branch_name, commit, [('Tests', i + 1, 5)] + ) + )) job = self._make_job(babysit=True, max_retries=5, comments=comments) branch = self._make_branch(commit=commit, name=branch_name) @@ -610,7 +619,9 @@ def test_workflow_exhausted_individually(self, client): handle_babysit_retry(job, branch, 'github_actions') # Only Tests should be retried (CI Build is exhausted) - job.project_repo.rerun_failed_workflow_jobs.assert_called_once_with(22222) + job.project_repo.rerun_failed_workflow_jobs.assert_called_once_with( + 22222 + ) # Check that only Tests is in the retry list workflows = exc_info.value.kwargs['workflows'] @@ -619,7 +630,7 @@ def test_workflow_exhausted_individually(self, client): assert workflows[0]['retry_count'] == 3 def test_all_workflows_exhausted(self, client): - """Test that BabysitExhausted is raised when all workflows exhausted.""" + """Test BabysitExhausted raised when all workflows exhausted.""" branch_name = 'w/5.0/feature/test' commit = 'abc1234567890' @@ -628,10 +639,13 @@ def test_all_workflows_exhausted(self, client): self._make_comment('user', '@bert-e babysit'), ] for i in range(5): - comments.append(self._make_comment('bert-e', self._make_retry_comment( - branch_name, commit, - [('CI Build', i + 1, 5), ('Tests', i + 1, 5)] - ))) + comments.append(self._make_comment( + 'bert-e', + self._make_retry_comment( + branch_name, commit, + [('CI Build', i + 1, 5), ('Tests', i + 1, 5)] + ) + )) job = self._make_job(babysit=True, max_retries=5, comments=comments) branch = self._make_branch(commit=commit, name=branch_name) @@ -772,7 +786,7 @@ def test_queue_babysit_skips_non_github(self): assert result is False def test_queue_babysit_skips_non_github_actions(self): - """Test queue babysit is skipped for non-github_actions build key.""" + """Test queue babysit skipped for non-github_actions build key.""" settings = SimpleNamespace( repository_host='github', max_babysit_retries=5, diff --git a/bert_e/workflow/gitwaterflow/__init__.py b/bert_e/workflow/gitwaterflow/__init__.py index 3e02636c..3adf3691 100644 --- a/bert_e/workflow/gitwaterflow/__init__.py +++ b/bert_e/workflow/gitwaterflow/__init__.py @@ -42,6 +42,7 @@ update_integration_branches) from .jira import jira_checks from . 
import queueing +from .babysit import handle_babysit_retry as _handle_babysit_retry LOG = logging.getLogger(__name__) @@ -675,5 +676,3 @@ def status(branch): assert worst_status == 'SUCCESSFUL' -# Import the shared babysit module -from .babysit import handle_babysit_retry as _handle_babysit_retry diff --git a/bert_e/workflow/gitwaterflow/babysit.py b/bert_e/workflow/gitwaterflow/babysit.py index 887e6ced..0ea160dd 100644 --- a/bert_e/workflow/gitwaterflow/babysit.py +++ b/bert_e/workflow/gitwaterflow/babysit.py @@ -52,8 +52,8 @@ WORKFLOW_RETRY_PATTERN = re.compile(r'\| `([^`]+)` \| (\d+)/(\d+) \|') -def count_babysit_retries_per_workflow(pull_request, robot_name, branch_name, - current_commit): +def count_babysit_retries_per_workflow(pull_request, robot_name, + branch_name, current_commit): """Count how many babysit retries have been done for each workflow. This parses BabysitRetry comments posted by Bert-E for the specific branch @@ -107,8 +107,8 @@ def count_babysit_retries_per_workflow(pull_request, robot_name, branch_name, # Extract workflow retry counts from the table for wf_match in WORKFLOW_RETRY_PATTERN.finditer(text): workflow_name = wf_match.group(1) - # The retry count in the message is the current retry number - # We just need to track that this workflow was retried + # The retry count in the message is the current number + # We just track that this workflow was retried workflow_retries[workflow_name] += 1 else: # Check if user sent /babysit command - this resets all counts @@ -276,7 +276,8 @@ def handle_babysit_retry(job, failed_branch, build_key, pull_request=None): wf['id'], wf['name'], branch_name, err) if not rerun_triggered: - LOG.warning("Babysit: could not trigger any reruns for %s", branch_name) + LOG.warning("Babysit: could not trigger any reruns for %s", + branch_name) return False # Raise BabysitRetry with per-workflow information diff --git a/bert_e/workflow/gitwaterflow/queueing.py b/bert_e/workflow/gitwaterflow/queueing.py index 22b5b5bb..902e2424 100644 --- a/bert_e/workflow/gitwaterflow/queueing.py +++ b/bert_e/workflow/gitwaterflow/queueing.py @@ -30,7 +30,7 @@ QueueIntegrationBranch, branch_factory, build_queue_collection) from .integration import get_integration_branches -from typing import List, Dict, Any +from typing import List LOG = logging.getLogger(__name__) @@ -61,7 +61,8 @@ def _check_pr_babysit_enabled(pull_request, settings) -> bool: authored = author == pull_request.author text = comment.text try: - reactor.handle_options(temp_job, text, prefix, privileged, authored) + reactor.handle_options( + temp_job, text, prefix, privileged, authored) except Exception: # Ignore errors, we just want to check for babysit pass @@ -69,8 +70,9 @@ def _check_pr_babysit_enabled(pull_request, settings) -> bool: return temp_job.settings.get('babysit', False) -def _handle_queue_babysit_retry(job: QueuesJob, queues: QueueCollection, - failed_prs: List[int]) -> bool: +def _handle_queue_babysit_retry(job: QueuesJob, + queues: QueueCollection, + failed_prs: List[int]) -> bool: """Handle babysit retry logic for failed queue builds. 
For each failed PR in the queue that has babysit enabled, this function @@ -131,8 +133,9 @@ def _handle_queue_babysit_retry(job: QueuesJob, queues: QueueCollection, if status != 'FAILED': continue - LOG.info("Queue babysit: checking failed build on %s for PR %d", - qint.name, pr_id) + LOG.info( + "Queue babysit: checking failed build on %s for PR %d", + qint.name, pr_id) # Create a temporary job-like object for the shared babysit logic temp_job = SimpleNamespace( @@ -155,11 +158,12 @@ def _handle_queue_babysit_retry(job: QueuesJob, queues: QueueCollection, except exceptions.BabysitExhausted as exhausted_exc: # Notify the PR about exhaustion notify_user(job.settings, pull_request, exhausted_exc) - retried_any = True # We handled it, just not with a retry + # We handled it, just not with a retry + retried_any = True except exceptions.BabysitCancelled as cancelled_exc: - # Notify the PR that babysit was cancelled due to new commits + # Babysit cancelled due to new commits notify_user(job.settings, pull_request, cancelled_exc) - # Don't set retried_any - let normal failure handling proceed + # Don't set retried_any - normal failure handling return retried_any
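
For readers who want to see the retry accounting from `count_babysit_retries_per_workflow` in isolation, the sketch below re-implements the core idea outside Bert-E. It reuses the `WORKFLOW_RETRY_PATTERN` regex shown in the patch and counts the robot's retry-table rows per workflow, resetting whenever the user comments `@bert-e babysit` again. This is a minimal illustration under simplifying assumptions: the real function also restricts the count to the failed branch and the current commit, and the `count_retries` helper and the sample comment history below are invented for this example only.

```python
# Simplified, standalone illustration of the per-workflow retry counting
# described in babysit.py.  It is NOT the actual implementation: the real
# count_babysit_retries_per_workflow() also filters BabysitRetry comments
# by branch name and commit, which is omitted here for brevity.
import re
from collections import defaultdict

# Same pattern as WORKFLOW_RETRY_PATTERN in babysit.py: matches table rows
# such as "| `CI Build` | 2/5 |" in the retry comments posted by Bert-E.
WORKFLOW_RETRY_PATTERN = re.compile(r'\| `([^`]+)` \| (\d+)/(\d+) \|')


def count_retries(comments, robot_name='bert-e'):
    """Count BabysitRetry comments per workflow since the last
    '@bert-e babysit' command.

    ``comments`` is a list of (author, text) tuples in chronological order.
    """
    retries = defaultdict(int)
    for author, text in comments:
        if author != robot_name:
            # A fresh '@bert-e babysit' command resets all counters,
            # which is how users get more retries after exhaustion.
            if f'@{robot_name} babysit' in text:
                retries.clear()
            continue
        # Each retry comment from the robot bumps the counter of every
        # workflow listed in its table.
        for match in WORKFLOW_RETRY_PATTERN.finditer(text):
            retries[match.group(1)] += 1
    return dict(retries)


if __name__ == '__main__':
    history = [
        ('alice', '@bert-e babysit'),
        ('bert-e', '| `CI Build` | 1/5 |\n| `Tests` | 1/5 |'),
        ('bert-e', '| `CI Build` | 2/5 |'),
    ]
    # Prints {'CI Build': 2, 'Tests': 1}: with max_babysit_retries=5,
    # CI Build still has 3 automatic retries left and Tests has 4.
    print(count_retries(history))
```

The per-workflow dictionary is what lets the queue and integration code retry only the workflows that have not yet reached their limit, while exhausted ones are reported in the `BabysitExhausted` message.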