From 424368816af98ba435a500fe060584410168e7c5 Mon Sep 17 00:00:00 2001
From: Carlos <carlos@sanz.dev>
Date: Sun, 18 Jan 2026 17:36:09 -0800
Subject: [PATCH] fix: retry tests on transient GCP errors instead of failing

QUOTA_EXCEEDED and ZONE_RESOURCE_POOL_EXHAUSTED are transient errors
that will resolve when other tests complete or resources become available.

Previously, these errors would mark the test as failed with an error
status on GitHub. This was confusing because:
1. The error message said "will be retried" but the test was already failed
2. Users saw a red X on their PR for infrastructure issues, not code problems

Now, transient GCP errors leave the test pending so it will be retried
on the next cron run. Only permanent errors (like RESOURCE_NOT_FOUND)
mark the test as failed.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 mod_ci/controllers.py             | 49 ++++++++++++++++++++++++++++--
 tests/test_ci/test_controllers.py | 50 ++++++++++++++++++++++++++++---
 2 files changed, 93 insertions(+), 6 deletions(-)

diff --git a/mod_ci/controllers.py b/mod_ci/controllers.py
index 9e8ea5ec..36510f16 100755
--- a/mod_ci/controllers.py
+++ b/mod_ci/controllers.py
@@ -131,14 +131,51 @@ def safe_db_commit(db, operation_description: str = "database operation") -> boo
         "The test will be retried automatically when resources become available."
     ),
     'QUOTA_EXCEEDED': (
-        "GCP quota limit reached. Please wait for other tests to complete "
-        "or contact the administrator."
+        "GCP quota limit reached. "
+        "The test will be retried automatically when resources become available."
     ),
     'RESOURCE_NOT_FOUND': "Required GCP resource not found. Please contact the administrator.",
     'RESOURCE_ALREADY_EXISTS': "A VM with this name already exists. Please contact the administrator.",
     'TIMEOUT': "GCP operation timed out. The test will be retried automatically.",
 }
 
+# GCP error codes that are transient and should be retried automatically.
+# Tests encountering these errors will remain pending and be picked up on the next cron run.
+GCP_RETRYABLE_ERRORS = {
+    'ZONE_RESOURCE_POOL_EXHAUSTED',
+    'QUOTA_EXCEEDED',
+}
+
+
+def get_gcp_error_code(result: Dict) -> Optional[str]:
+    """
+    Extract the error code from a GCP API error response.
+
+    :param result: The GCP API response dictionary
+    :return: The error code string, or None if not found
+    """
+    if not isinstance(result, dict):
+        return None
+    error = result.get('error')
+    if not isinstance(error, dict):
+        return None
+    errors = error.get('errors', [])
+    if not errors or not isinstance(errors, list):
+        return None
+    first_error = errors[0] if len(errors) > 0 else {}
+    return first_error.get('code')
+
+
+def is_retryable_gcp_error(result: Dict) -> bool:
+    """
+    Check if a GCP error is transient and should be retried.
+
+    :param result: The GCP API response dictionary
+    :return: True if the error is retryable, False otherwise
+    """
+    error_code = get_gcp_error_code(result)
+    return error_code in GCP_RETRYABLE_ERRORS
+
 
 def parse_gcp_error(result: Dict, log=None) -> str:
     """
@@ -929,6 +966,10 @@ def start_test(compute, app, db, repository: Repository.Repository, test, bot_to
     # Check if the create_instance call itself returned an error (synchronous failure)
     if 'error' in operation:
         error_msg = parse_gcp_error(operation)
+        if is_retryable_gcp_error(operation):
+            # Transient error - leave test pending for retry on next cron run
+            log.warning(f"Test {test.id}: VM creation hit retryable error, will retry: {error_msg}")
+            return
         log.error(f"Error creating test instance for test {test.id}, result: {operation}")
         mark_test_failed(db, test, repository, error_msg)
         return
@@ -947,6 +988,10 @@ def start_test(compute, app, db, repository: Repository.Repository, test, bot_to
         error_msg = parse_gcp_error(result)
         log.error(f"Test {test.id}: VM creation failed: {error_msg}")
         log.error(f"Test {test.id}: Full GCP response: {result}")
+        if is_retryable_gcp_error(result):
+            # Transient error - leave test pending for retry on next cron run
+            log.info(f"Test {test.id}: Error is retryable, will retry on next cron run")
+            return
         mark_test_failed(db, test, repository, error_msg)
         return
 
diff --git a/tests/test_ci/test_controllers.py b/tests/test_ci/test_controllers.py
index d74303a0..24643f00 100644
--- a/tests/test_ci/test_controllers.py
+++ b/tests/test_ci/test_controllers.py
@@ -3901,14 +3901,14 @@ def _setup_start_test_mocks(self, mock_g, mock_gcp_instance, mock_test_progress,
     @mock.patch('mod_ci.controllers.g')
     @mock.patch('mod_ci.controllers.TestProgress')
     @mock.patch('mod_ci.controllers.GcpInstance')
-    def test_start_test_quota_exceeded_no_db_record(
+    def test_start_test_quota_exceeded_retries_instead_of_failing(
             self, mock_gcp_instance, mock_test_progress, mock_g, mock_open_file,
             mock_zipfile, mock_requests_get, mock_find_artifact,
             mock_create_instance, mock_wait_for_operation, mock_mark_failed, mock_log):
-        """Test that QUOTA_EXCEEDED error prevents gcp_instance record creation.
+        """Test that QUOTA_EXCEEDED error leaves test pending for retry.
 
-        This is the exact scenario from test #7768: GCP operation returns
-        successfully but completes with QUOTA_EXCEEDED error.
+        QUOTA_EXCEEDED is a transient error - the test should remain pending
+        and be retried on the next cron run, not marked as failed.
         """
         from mod_ci.controllers import start_test
 
@@ -3925,7 +3925,49 @@ def test_start_test_quota_exceeded_no_db_record(
 
         start_test(MagicMock(), self.app, mock_g.db, MagicMock(), Test.query.first(), "token")
 
+        # QUOTA_EXCEEDED is retryable, so mark_test_failed should NOT be called
+        mock_mark_failed.assert_not_called()
+        # No GcpInstance record should be created
+        mock_g.db.add.assert_not_called()
+
+    @mock.patch('run.log')
+    @mock.patch('mod_ci.controllers.mark_test_failed')
+    @mock.patch('mod_ci.controllers.wait_for_operation')
+    @mock.patch('mod_ci.controllers.create_instance')
+    @mock.patch('mod_ci.controllers.find_artifact_for_commit')
+    @mock.patch('mod_ci.controllers.requests.get')
+    @mock.patch('mod_ci.controllers.zipfile.ZipFile')
+    @mock.patch('builtins.open', new_callable=mock.mock_open())
+    @mock.patch('mod_ci.controllers.g')
+    @mock.patch('mod_ci.controllers.TestProgress')
+    @mock.patch('mod_ci.controllers.GcpInstance')
+    def test_start_test_non_retryable_error_marks_failed(
+            self, mock_gcp_instance, mock_test_progress, mock_g, mock_open_file,
+            mock_zipfile, mock_requests_get, mock_find_artifact,
+            mock_create_instance, mock_wait_for_operation, mock_mark_failed, mock_log):
+        """Test that non-retryable errors (like RESOURCE_NOT_FOUND) mark test as failed.
+
+        Only transient errors like QUOTA_EXCEEDED should be retried. Permanent
+        errors like RESOURCE_NOT_FOUND should immediately mark the test as failed.
+        """
+        from mod_ci.controllers import start_test
+
+        self._setup_start_test_mocks(mock_g, mock_gcp_instance, mock_test_progress,
+                                     mock_find_artifact, mock_requests_get, mock_zipfile)
+
+        # VM creation returns operation ID, but wait_for_operation returns non-retryable error
+        mock_create_instance.return_value = {'name': 'op-123', 'status': 'RUNNING'}
+        mock_wait_for_operation.return_value = {
+            'status': 'DONE',
+            'error': {'errors': [{'code': 'RESOURCE_NOT_FOUND', 'message': 'Resource not found'}]},
+            'httpErrorStatusCode': 404
+        }
+
+        start_test(MagicMock(), self.app, mock_g.db, MagicMock(), Test.query.first(), "token")
+
+        # RESOURCE_NOT_FOUND is NOT retryable, so mark_test_failed SHOULD be called
         mock_mark_failed.assert_called_once()
+        # No GcpInstance record should be created
         mock_g.db.add.assert_not_called()
 
     @mock.patch('run.log')