diff --git a/mod_ci/controllers.py b/mod_ci/controllers.py index 9e8ea5ec..36510f16 100755 --- a/mod_ci/controllers.py +++ b/mod_ci/controllers.py @@ -131,14 +131,51 @@ def safe_db_commit(db, operation_description: str = "database operation") -> boo "The test will be retried automatically when resources become available." ), 'QUOTA_EXCEEDED': ( - "GCP quota limit reached. Please wait for other tests to complete " - "or contact the administrator." + "GCP quota limit reached. " + "The test will be retried automatically when resources become available." ), 'RESOURCE_NOT_FOUND': "Required GCP resource not found. Please contact the administrator.", 'RESOURCE_ALREADY_EXISTS': "A VM with this name already exists. Please contact the administrator.", 'TIMEOUT': "GCP operation timed out. The test will be retried automatically.", } +# GCP error codes that are transient and should be retried automatically. +# Tests encountering these errors will remain pending and be picked up on the next cron run. +GCP_RETRYABLE_ERRORS = { + 'ZONE_RESOURCE_POOL_EXHAUSTED', + 'QUOTA_EXCEEDED', +} + + +def get_gcp_error_code(result: Dict) -> Optional[str]: + """ + Extract the error code from a GCP API error response. + + :param result: The GCP API response dictionary + :return: The error code string, or None if not found + """ + if not isinstance(result, dict): + return None + error = result.get('error') + if not isinstance(error, dict): + return None + errors = error.get('errors', []) + if not errors or not isinstance(errors, list): + return None + first_error = errors[0] if len(errors) > 0 else {} + return first_error.get('code') + + +def is_retryable_gcp_error(result: Dict) -> bool: + """ + Check if a GCP error is transient and should be retried. + + :param result: The GCP API response dictionary + :return: True if the error is retryable, False otherwise + """ + error_code = get_gcp_error_code(result) + return error_code in GCP_RETRYABLE_ERRORS + def parse_gcp_error(result: Dict, log=None) -> str: """ @@ -929,6 +966,10 @@ def start_test(compute, app, db, repository: Repository.Repository, test, bot_to # Check if the create_instance call itself returned an error (synchronous failure) if 'error' in operation: error_msg = parse_gcp_error(operation) + if is_retryable_gcp_error(operation): + # Transient error - leave test pending for retry on next cron run + log.warning(f"Test {test.id}: VM creation hit retryable error, will retry: {error_msg}") + return log.error(f"Error creating test instance for test {test.id}, result: {operation}") mark_test_failed(db, test, repository, error_msg) return @@ -947,6 +988,10 @@ def start_test(compute, app, db, repository: Repository.Repository, test, bot_to error_msg = parse_gcp_error(result) log.error(f"Test {test.id}: VM creation failed: {error_msg}") log.error(f"Test {test.id}: Full GCP response: {result}") + if is_retryable_gcp_error(result): + # Transient error - leave test pending for retry on next cron run + log.info(f"Test {test.id}: Error is retryable, will retry on next cron run") + return mark_test_failed(db, test, repository, error_msg) return diff --git a/tests/test_ci/test_controllers.py b/tests/test_ci/test_controllers.py index d74303a0..24643f00 100644 --- a/tests/test_ci/test_controllers.py +++ b/tests/test_ci/test_controllers.py @@ -3901,14 +3901,14 @@ def _setup_start_test_mocks(self, mock_g, mock_gcp_instance, mock_test_progress, @mock.patch('mod_ci.controllers.g') @mock.patch('mod_ci.controllers.TestProgress') @mock.patch('mod_ci.controllers.GcpInstance') - def test_start_test_quota_exceeded_no_db_record( + def test_start_test_quota_exceeded_retries_instead_of_failing( self, mock_gcp_instance, mock_test_progress, mock_g, mock_open_file, mock_zipfile, mock_requests_get, mock_find_artifact, mock_create_instance, mock_wait_for_operation, mock_mark_failed, mock_log): - """Test that QUOTA_EXCEEDED error prevents gcp_instance record creation. + """Test that QUOTA_EXCEEDED error leaves test pending for retry. - This is the exact scenario from test #7768: GCP operation returns - successfully but completes with QUOTA_EXCEEDED error. + QUOTA_EXCEEDED is a transient error - the test should remain pending + and be retried on the next cron run, not marked as failed. """ from mod_ci.controllers import start_test @@ -3925,7 +3925,49 @@ def test_start_test_quota_exceeded_no_db_record( start_test(MagicMock(), self.app, mock_g.db, MagicMock(), Test.query.first(), "token") + # QUOTA_EXCEEDED is retryable, so mark_test_failed should NOT be called + mock_mark_failed.assert_not_called() + # No GcpInstance record should be created + mock_g.db.add.assert_not_called() + + @mock.patch('run.log') + @mock.patch('mod_ci.controllers.mark_test_failed') + @mock.patch('mod_ci.controllers.wait_for_operation') + @mock.patch('mod_ci.controllers.create_instance') + @mock.patch('mod_ci.controllers.find_artifact_for_commit') + @mock.patch('mod_ci.controllers.requests.get') + @mock.patch('mod_ci.controllers.zipfile.ZipFile') + @mock.patch('builtins.open', new_callable=mock.mock_open()) + @mock.patch('mod_ci.controllers.g') + @mock.patch('mod_ci.controllers.TestProgress') + @mock.patch('mod_ci.controllers.GcpInstance') + def test_start_test_non_retryable_error_marks_failed( + self, mock_gcp_instance, mock_test_progress, mock_g, mock_open_file, + mock_zipfile, mock_requests_get, mock_find_artifact, + mock_create_instance, mock_wait_for_operation, mock_mark_failed, mock_log): + """Test that non-retryable errors (like RESOURCE_NOT_FOUND) mark test as failed. + + Only transient errors like QUOTA_EXCEEDED should be retried. Permanent + errors like RESOURCE_NOT_FOUND should immediately mark the test as failed. + """ + from mod_ci.controllers import start_test + + self._setup_start_test_mocks(mock_g, mock_gcp_instance, mock_test_progress, + mock_find_artifact, mock_requests_get, mock_zipfile) + + # VM creation returns operation ID, but wait_for_operation returns non-retryable error + mock_create_instance.return_value = {'name': 'op-123', 'status': 'RUNNING'} + mock_wait_for_operation.return_value = { + 'status': 'DONE', + 'error': {'errors': [{'code': 'RESOURCE_NOT_FOUND', 'message': 'Resource not found'}]}, + 'httpErrorStatusCode': 404 + } + + start_test(MagicMock(), self.app, mock_g.db, MagicMock(), Test.query.first(), "token") + + # RESOURCE_NOT_FOUND is NOT retryable, so mark_test_failed SHOULD be called mock_mark_failed.assert_called_once() + # No GcpInstance record should be created mock_g.db.add.assert_not_called() @mock.patch('run.log')