-
-
Notifications
You must be signed in to change notification settings - Fork 4.7k
feat(sentry apps): Add circuit breaker into webhook code #111723
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
8e9634f
cb4c18f
62644f7
746fdf6
099cf92
93fd2c1
21d422c
417e8fc
cf8357b
16f2113
1d1b904
3fa148b
791699d
e755d7f
0efb782
75f1fcf
9965fa0
7e4e58b
7f11bed
3acaeda
02bd807
ff24594
0c41b3e
c492fb5
94f985d
a01f42c
8901b14
07d4036
ab52d21
85538e5
1126a3d
0c02937
000127d
26eee2c
3688925
c681859
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,39 @@ | ||
| import logging | ||
| from collections.abc import Generator | ||
| from contextlib import contextmanager | ||
|
|
||
| from sentry.utils.circuit_breaker2 import CircuitBreaker | ||
|
|
||
| logger = logging.getLogger("sentry.sentry_apps.circuit_breaker") | ||
|
|
||
|
|
||
| @contextmanager | ||
| def circuit_breaker_tracking( | ||
| breaker: CircuitBreaker | None, | ||
| ) -> Generator[None]: | ||
| """Track request outcome: record_error on WebhookTimeoutError, record_success on normal exit. | ||
|
|
||
| Handles the None case as a no-op so callers don't need nullcontext(). | ||
| """ | ||
|
cursor[bot] marked this conversation as resolved.
|
||
| from sentry.utils.sentry_apps.webhooks import WebhookTimeoutError | ||
|
Christinarlong marked this conversation as resolved.
|
||
|
|
||
| if breaker is None: | ||
| yield | ||
| return | ||
| try: | ||
| yield | ||
|
|
||
| # Currently we only count WebhookTimeoutError as an error in the circuit breaker as those operations are the ones that are taking too long | ||
| # If an app returns a say 500, in a reasonable time that's okay | ||
| except WebhookTimeoutError: | ||
| # This is gross but we don't want to propagate a redis or circuit breaker error to the webhook code | ||
| try: | ||
| breaker.record_error() | ||
| except Exception: | ||
| logger.exception("sentry_apps.circuit_breaker.record_error.failure") | ||
| raise | ||
|
sentry[bot] marked this conversation as resolved.
|
||
| else: | ||
| try: | ||
| breaker.record_success() | ||
| except Exception: | ||
| logger.exception("sentry_apps.circuit_breaker.record_success.failure") | ||
|
Christinarlong marked this conversation as resolved.
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -4,6 +4,7 @@ | |
| from collections.abc import Callable, Mapping | ||
| from types import FrameType | ||
| from typing import TYPE_CHECKING, Any, Concatenate, ParamSpec, TypeVar | ||
| from urllib.parse import urlparse | ||
|
|
||
| import sentry_sdk | ||
| from requests import RequestException, Response | ||
|
|
@@ -13,6 +14,8 @@ | |
| from sentry import features, options | ||
| from sentry.exceptions import RestrictedIPAddress | ||
| from sentry.http import safe_urlopen | ||
| from sentry.integrations.utils.metrics import EventLifecycle | ||
| from sentry.organizations.services.organization.model import RpcUserOrganizationContext | ||
| from sentry.organizations.services.organization.service import organization_service | ||
| from sentry.sentry_apps.metrics import ( | ||
| SentryAppEventType, | ||
|
|
@@ -23,7 +26,10 @@ | |
| from sentry.sentry_apps.utils.errors import SentryAppSentryError | ||
| from sentry.shared_integrations.exceptions import ApiHostError, ApiTimeoutError, ClientError | ||
| from sentry.taskworker.timeout import timeout_alarm | ||
| from sentry.utils import metrics | ||
| from sentry.utils.circuit_breaker2 import CircuitBreaker, RateBasedTripStrategy | ||
| from sentry.utils.sentry_apps import SentryAppWebhookRequestsBuffer | ||
| from sentry.utils.sentry_apps.circuit_breaker import circuit_breaker_tracking | ||
|
|
||
| if TYPE_CHECKING: | ||
| from sentry.sentry_apps.api.serializers.app_platform_event import AppPlatformEvent | ||
|
|
@@ -51,7 +57,6 @@ def _handle_webhook_timeout(signum: int, frame: FrameType | None) -> None: | |
| """Handler for when a webhook request exceeds the hard timeout deadline. | ||
| - This is a workaround for safe_create_connection sockets hanging when the given url | ||
| cannot be reached or resolved. | ||
| - TODO(christinarlong): Add sentry app disabling logic here | ||
| """ | ||
| raise WebhookTimeoutError("Webhook request exceeded hard timeout deadline") | ||
|
|
||
|
|
@@ -73,6 +78,79 @@ def wrapper( | |
| return wrapper | ||
|
|
||
|
|
||
| def _create_circuit_breaker( | ||
| sentry_app: SentryApp | RpcSentryApp, | ||
| organization_context: RpcUserOrganizationContext | None, | ||
| ) -> CircuitBreaker | None: | ||
| if organization_context is None or not features.has( | ||
| "organizations:sentry-app-webhook-circuit-breaker", | ||
| organization_context.organization, | ||
| ): | ||
| return None | ||
| config = options.get("sentry-apps.webhook.circuit-breaker.config") | ||
| return CircuitBreaker( | ||
| key=f"sentry-app.webhook.{sentry_app.slug}", | ||
| config=config, | ||
| trip_strategy=RateBasedTripStrategy.from_config(config), | ||
| ) | ||
|
Christinarlong marked this conversation as resolved.
|
||
|
|
||
|
|
||
| def _circuit_breaker_allows_request( | ||
| circuit_breaker: CircuitBreaker | None, | ||
| sentry_app: SentryApp | RpcSentryApp, | ||
| org_id: int, | ||
| lifecycle: EventLifecycle, | ||
| ) -> bool: | ||
| if circuit_breaker is None or circuit_breaker.should_allow_request(): | ||
| return True | ||
|
|
||
| dry_run = options.get("sentry-apps.webhook.circuit-breaker.dry-run") | ||
| if dry_run: | ||
| metrics.incr( | ||
| "sentry_app.webhook.circuit_breaker.would_block", | ||
| tags={"slug": sentry_app.slug}, | ||
|
Christinarlong marked this conversation as resolved.
|
||
| ) | ||
| logger.warning( | ||
| "sentry_app.webhook.circuit_breaker.would_block", | ||
| extra={"slug": sentry_app.slug, "org_id": org_id}, | ||
| ) | ||
| return True | ||
|
|
||
| lifecycle.record_halt( | ||
| halt_reason=f"send_and_save_webhook_request.{SentryAppWebhookHaltReason.CIRCUIT_BROKEN}" | ||
| ) | ||
| return False | ||
|
|
||
|
|
||
| def _send_webhook_request( | ||
| url: str, | ||
| app_platform_event: AppPlatformEvent[T], | ||
| organization_context: RpcUserOrganizationContext | None, | ||
| ) -> Response: | ||
| if organization_context is not None and features.has( | ||
| "organizations:sentry-app-webhook-hard-timeout", | ||
| organization_context.organization, | ||
| ): | ||
| # We're using a signal based timeout here because we need to interrupt the blocking | ||
| # socket.connect() operation. See SENTRY-5HA6 for more context. Here we're hanging at | ||
| # the socket.connect() call and the timeout we set in safe_urlopen is not being respected. | ||
| timeout_seconds = options.get("sentry-apps.webhook.hard-timeout.sec") | ||
| with timeout_alarm(timeout_seconds, _handle_webhook_timeout): | ||
| return safe_urlopen( | ||
| url=url, | ||
| data=app_platform_event.body, | ||
| headers=app_platform_event.headers, | ||
| timeout=options.get("sentry-apps.webhook.timeout.sec"), | ||
| ) | ||
|
|
||
| return safe_urlopen( | ||
| url=url, | ||
| data=app_platform_event.body, | ||
| headers=app_platform_event.headers, | ||
| timeout=options.get("sentry-apps.webhook.timeout.sec"), | ||
| ) | ||
|
|
||
|
|
||
| @sentry_sdk.trace(name="send_and_save_webhook_request") | ||
| @ignore_unpublished_app_errors | ||
| def send_and_save_webhook_request( | ||
|
|
@@ -124,28 +202,12 @@ def send_and_save_webhook_request( | |
| include_projects=False, | ||
| include_teams=False, | ||
| ) | ||
| if organization_context is not None and features.has( | ||
| "organizations:sentry-app-webhook-hard-timeout", | ||
| organization_context.organization, | ||
| ): | ||
| # We're using a signal based timeout here because we need to interrupt the blocking socket.connect() opeartion. | ||
| # See SENTRY-5HA6 for more context. Here we're hanging at the socket.connect() call and the timeout we set | ||
| # in safe_urlopen is not being respected. | ||
| timeout_seconds = options.get("sentry-apps.webhook.hard-timeout.sec") | ||
| with timeout_alarm(timeout_seconds, _handle_webhook_timeout): | ||
| response = safe_urlopen( | ||
| url=url, | ||
| data=app_platform_event.body, | ||
| headers=app_platform_event.headers, | ||
| timeout=options.get("sentry-apps.webhook.timeout.sec"), | ||
| ) | ||
| else: | ||
| response = safe_urlopen( | ||
| url=url, | ||
| data=app_platform_event.body, | ||
| headers=app_platform_event.headers, | ||
| timeout=options.get("sentry-apps.webhook.timeout.sec"), | ||
| ) | ||
| circuit_breaker = _create_circuit_breaker(sentry_app, organization_context) | ||
|
Christinarlong marked this conversation as resolved.
|
||
| if not _circuit_breaker_allows_request(circuit_breaker, sentry_app, org_id, lifecycle): | ||
| return Response() | ||
|
|
||
| with circuit_breaker_tracking(circuit_breaker): | ||
| response = _send_webhook_request(url, app_platform_event, organization_context) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Circuit breaker errors can crash webhook delivery for published appsMedium Severity
Additional Locations (1)
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. hmm, we have retries for uncaught errors at this step so idt it's that big of a deal |
||
|
|
||
| except WebhookTimeoutError: | ||
| lifecycle.record_halt( | ||
|
|
@@ -186,13 +248,19 @@ def send_and_save_webhook_request( | |
| raise | ||
|
|
||
| track_response_code(response.status_code, slug, event) | ||
|
|
||
| project_id = ( | ||
| int(p_id) | ||
| if (p_id := response.headers.get("Sentry-Hook-Project")) and p_id.isdigit() | ||
| else None | ||
| ) | ||
| buffer.add_request( | ||
| response_code=response.status_code, | ||
| org_id=org_id, | ||
| event=event, | ||
| url=url, | ||
| error_id=response.headers.get("Sentry-Hook-Error"), | ||
| project_id=response.headers.get("Sentry-Hook-Project"), | ||
| project_id=project_id, | ||
| response=response, | ||
| headers=app_platform_event.headers, | ||
| ) | ||
|
|
@@ -223,13 +291,15 @@ def send_and_save_webhook_request( | |
| lifecycle.record_halt( | ||
| halt_reason=f"send_and_save_webhook_request.{SentryAppWebhookHaltReason.INTEGRATOR_ERROR}" | ||
| ) | ||
| raise ApiHostError.from_request(response.request) | ||
| raise ApiHostError(f"Unable to reach host: {urlparse(url).netloc}", url=url) | ||
|
|
||
| elif response.status_code == status.HTTP_504_GATEWAY_TIMEOUT: | ||
| lifecycle.record_halt( | ||
| halt_reason=f"send_and_save_webhook_request.{SentryAppWebhookHaltReason.INTEGRATOR_ERROR}" | ||
| ) | ||
| raise ApiTimeoutError.from_request(response.request) | ||
| raise ApiTimeoutError( | ||
| f"Timed out attempting to reach host: {urlparse(url).netloc}", url=url | ||
| ) | ||
|
|
||
| elif 400 <= response.status_code < 500: | ||
| lifecycle.record_halt( | ||
|
|
@@ -243,4 +313,5 @@ def send_and_save_webhook_request( | |
| except RequestException as e: | ||
| lifecycle.record_halt(e) | ||
| raise | ||
|
|
||
| return response | ||


Uh oh!
There was an error while loading. Please reload this page.