From 59544ab18abd559c29bd1fb669b5ad6b324f8fb5 Mon Sep 17 00:00:00 2001 From: Mark Story Date: Thu, 26 Mar 2026 10:56:36 -0400 Subject: [PATCH 1/4] fix(cells) Don't record proxied request failures towards circuit breaker We don't want to trip the breaker when the proxied requests fail with 500s. We're mostly interested in preventing resource exhaustion from slow requests/connection errors here. Refs INFRENG-275 --- src/sentry/hybridcloud/apigateway/proxy.py | 3 +-- tests/sentry/hybridcloud/apigateway/test_proxy.py | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/sentry/hybridcloud/apigateway/proxy.py b/src/sentry/hybridcloud/apigateway/proxy.py index 67b62863da0aa7..9f9549f55dab9f 100644 --- a/src/sentry/hybridcloud/apigateway/proxy.py +++ b/src/sentry/hybridcloud/apigateway/proxy.py @@ -199,9 +199,8 @@ def proxy_cell_request(request: HttpRequest, cell: Cell, url_name: str) -> HttpR # remote silo timeout. Use DRF timeout instead raise RequestTimeout() - if resp.status_code >= 500 and circuit_breaker is not None: + if resp.status_code >= 500: metrics.incr("apigateway.proxy.request_failed", tags=metric_tags) - circuit_breaker.record_error() new_headers = clean_outbound_headers(resp.headers) resp.headers.clear() diff --git a/tests/sentry/hybridcloud/apigateway/test_proxy.py b/tests/sentry/hybridcloud/apigateway/test_proxy.py index 1f9c73be3c2584..f1f65e5259ba44 100644 --- a/tests/sentry/hybridcloud/apigateway/test_proxy.py +++ b/tests/sentry/hybridcloud/apigateway/test_proxy.py @@ -352,7 +352,7 @@ def test_timeout_records_error(self) -> None: @responses.activate @override_options(CB_ENABLED) - def test_5xx_response_records_error(self) -> None: + def test_5xx_response_does_not_record_error(self) -> None: responses.add( responses.GET, f"{self.REGION.address}/server-error", @@ -366,7 +366,7 @@ def test_5xx_response_records_error(self) -> None: request = RequestFactory().get("http://sentry.io/server-error") resp = proxy_request(request, self.organization.slug, url_name) assert resp.status_code == 500 - mock_breaker.record_error.assert_called_once() + mock_breaker.record_error.assert_not_called() @responses.activate @override_options(CB_ENABLED) From 7ba1259943cbef1c04833101fc09c8c22e54db45 Mon Sep 17 00:00:00 2001 From: Mark Story Date: Thu, 26 Mar 2026 12:21:52 -0400 Subject: [PATCH 2/4] Add ConnectionError to apigateway circuit breaker --- src/sentry/hybridcloud/apigateway/proxy.py | 11 +++++- .../hybridcloud/apigateway/test_proxy.py | 35 ++++++++++++++++++- 2 files changed, 44 insertions(+), 2 deletions(-) diff --git a/src/sentry/hybridcloud/apigateway/proxy.py b/src/sentry/hybridcloud/apigateway/proxy.py index 9f9549f55dab9f..4499421c16d47d 100644 --- a/src/sentry/hybridcloud/apigateway/proxy.py +++ b/src/sentry/hybridcloud/apigateway/proxy.py @@ -14,7 +14,7 @@ from django.http.response import HttpResponseBase from requests import Response as ExternalResponse from requests import request as external_request -from requests.exceptions import Timeout +from requests.exceptions import ConnectionError, Timeout from sentry import options from sentry.api.exceptions import RequestTimeout @@ -198,6 +198,15 @@ def proxy_cell_request(request: HttpRequest, cell: Cell, url_name: str) -> HttpR # remote silo timeout. Use DRF timeout instead raise RequestTimeout() + except ConnectionError: + metrics.incr("apigateway.proxy.connection_error", tags=metric_tags) + try: + if circuit_breaker is not None: + circuit_breaker.record_error() + except Exception: + logger.exception("Failed to record circuitbreaker failure") + + raise if resp.status_code >= 500: metrics.incr("apigateway.proxy.request_failed", tags=metric_tags) diff --git a/tests/sentry/hybridcloud/apigateway/test_proxy.py b/tests/sentry/hybridcloud/apigateway/test_proxy.py index f1f65e5259ba44..8adedb14ed6c22 100644 --- a/tests/sentry/hybridcloud/apigateway/test_proxy.py +++ b/tests/sentry/hybridcloud/apigateway/test_proxy.py @@ -7,7 +7,7 @@ from django.core.files.uploadedfile import SimpleUploadedFile from django.http import HttpResponse from django.test.client import RequestFactory -from requests.exceptions import Timeout +from requests.exceptions import ConnectionError, Timeout from sentry.api.exceptions import RequestTimeout from sentry.hybridcloud.apigateway.proxy import proxy_request @@ -350,6 +350,39 @@ def test_timeout_records_error(self) -> None: proxy_request(request, self.organization.slug, url_name) mock_breaker.record_error.assert_called_once() + @responses.activate + @override_options(CB_ENABLED) + def test_connection_error_records_error(self) -> None: + responses.add( + responses.GET, + f"{self.REGION.address}/connect-error", + body=ConnectionError(), + ) + with patch("sentry.hybridcloud.apigateway.proxy.CircuitBreaker") as mock_breaker_class: + mock_breaker = self._make_breaker_mock(allow_request=True) + mock_breaker_class.return_value = mock_breaker + request = RequestFactory().get("http://sentry.io/connect-error") + with pytest.raises(ConnectionError): + proxy_request(request, self.organization.slug, url_name) + mock_breaker.record_error.assert_called_once() + + @responses.activate + @override_options(CB_ENABLED) + def test_connection_error_records_metric(self) -> None: + responses.add( + responses.GET, + f"{self.REGION.address}/connect-error", + body=ConnectionError(), + ) + with patch("sentry.hybridcloud.apigateway.proxy.metrics") as mock_metrics: + request = RequestFactory().get("http://sentry.io/connect-error") + with pytest.raises(ConnectionError): + proxy_request(request, self.organization.slug, url_name) + mock_metrics.incr.assert_any_call( + "apigateway.proxy.connection_error", + tags={"region": self.REGION.name, "url_name": url_name}, + ) + @responses.activate @override_options(CB_ENABLED) def test_5xx_response_does_not_record_error(self) -> None: From 7b3b7cf9c7631e864bdc5c10b610cab0c3cfb0ae Mon Sep 17 00:00:00 2001 From: Mark Story Date: Fri, 27 Mar 2026 11:44:10 -0400 Subject: [PATCH 3/4] Still record circuit breaker hits on 503+ These errors indicate failures from downstream loadbalancers that indicate that the downstream application is gone/timing out. --- src/sentry/hybridcloud/apigateway/proxy.py | 18 +++++-------- .../hybridcloud/apigateway/test_proxy.py | 26 ++++++++++++++++--- 2 files changed, 29 insertions(+), 15 deletions(-) diff --git a/src/sentry/hybridcloud/apigateway/proxy.py b/src/sentry/hybridcloud/apigateway/proxy.py index 1d763ce569003b..9f13ce9b8a6f2c 100644 --- a/src/sentry/hybridcloud/apigateway/proxy.py +++ b/src/sentry/hybridcloud/apigateway/proxy.py @@ -193,26 +193,22 @@ def proxy_cell_request(request: HttpRequest, cell: Cell, url_name: str) -> HttpR ) except Timeout: metrics.incr("apigateway.proxy.request_timeout", tags=metric_tags) - try: - if circuit_breaker is not None: - circuit_breaker.record_error() - except Exception: - logger.exception("Failed to record circuitbreaker failure") + if circuit_breaker is not None: + circuit_breaker.record_error() # remote silo timeout. Use DRF timeout instead raise RequestTimeout() except ConnectionError: metrics.incr("apigateway.proxy.connection_error", tags=metric_tags) - try: - if circuit_breaker is not None: - circuit_breaker.record_error() - except Exception: - logger.exception("Failed to record circuitbreaker failure") + if circuit_breaker is not None: + circuit_breaker.record_error() raise - if resp.status_code >= 500: + if resp.status_code >= 502: metrics.incr("apigateway.proxy.request_failed", tags=metric_tags) + if circuit_breaker is not None: + circuit_breaker.record_error() new_headers = clean_outbound_headers(resp.headers) resp.headers.clear() diff --git a/tests/sentry/hybridcloud/apigateway/test_proxy.py b/tests/sentry/hybridcloud/apigateway/test_proxy.py index 0c9b1d20147a17..4b637b8973a848 100644 --- a/tests/sentry/hybridcloud/apigateway/test_proxy.py +++ b/tests/sentry/hybridcloud/apigateway/test_proxy.py @@ -355,7 +355,7 @@ def test_timeout_records_error(self) -> None: def test_connection_error_records_error(self) -> None: responses.add( responses.GET, - f"{self.REGION.address}/connect-error", + f"{self.CELL.address}/connect-error", body=ConnectionError(), ) with patch("sentry.hybridcloud.apigateway.proxy.CircuitBreaker") as mock_breaker_class: @@ -371,7 +371,7 @@ def test_connection_error_records_error(self) -> None: def test_connection_error_records_metric(self) -> None: responses.add( responses.GET, - f"{self.REGION.address}/connect-error", + f"{self.CELL.address}/connect-error", body=ConnectionError(), ) with patch("sentry.hybridcloud.apigateway.proxy.metrics") as mock_metrics: @@ -380,12 +380,30 @@ def test_connection_error_records_metric(self) -> None: proxy_request(request, self.organization.slug, url_name) mock_metrics.incr.assert_any_call( "apigateway.proxy.connection_error", - tags={"region": self.REGION.name, "url_name": url_name}, + tags={"region": self.CELL.name, "url_name": url_name}, ) @responses.activate @override_options(CB_ENABLED) - def test_5xx_response_does_not_record_error(self) -> None: + def test_504_response_does_not_record_error(self) -> None: + responses.add( + responses.GET, + f"{self.CELL.address}/server-error", + status=504, + body=json.dumps({"detail": "gateway timeout"}), + content_type="application/json", + ) + with patch("sentry.hybridcloud.apigateway.proxy.CircuitBreaker") as mock_breaker_class: + mock_breaker = self._make_breaker_mock(allow_request=True) + mock_breaker_class.return_value = mock_breaker + request = RequestFactory().get("http://sentry.io/server-error") + resp = proxy_request(request, self.organization.slug, url_name) + assert resp.status_code == 504 + mock_breaker.record_error.assert_called_once() + + @responses.activate + @override_options(CB_ENABLED) + def test_500_response_does_not_record_error(self) -> None: responses.add( responses.GET, f"{self.CELL.address}/server-error", From f538f1467f39d69ee322e064e956dc85c232e489 Mon Sep 17 00:00:00 2001 From: Mark Story Date: Fri, 27 Mar 2026 12:52:13 -0400 Subject: [PATCH 4/4] Fix method name --- tests/sentry/hybridcloud/apigateway/test_proxy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/sentry/hybridcloud/apigateway/test_proxy.py b/tests/sentry/hybridcloud/apigateway/test_proxy.py index 4b637b8973a848..c8a66042dfc74d 100644 --- a/tests/sentry/hybridcloud/apigateway/test_proxy.py +++ b/tests/sentry/hybridcloud/apigateway/test_proxy.py @@ -385,7 +385,7 @@ def test_connection_error_records_metric(self) -> None: @responses.activate @override_options(CB_ENABLED) - def test_504_response_does_not_record_error(self) -> None: + def test_504_response_does_record_error(self) -> None: responses.add( responses.GET, f"{self.CELL.address}/server-error",