Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
8e9634f
reuse taskworker alarm for webhooks
Christinarlong Nov 26, 2025
cb4c18f
Merge branch 'master' into crl/webhook-alarm
Christinarlong Dec 18, 2025
62644f7
update tests for webhook alarm
Christinarlong Dec 18, 2025
746fdf6
Merge branch 'master' into crl/webhook-alarm
Christinarlong Feb 17, 2026
099cf92
Merge branch 'master' into crl/webhook-alarm
Christinarlong Feb 19, 2026
93fd2c1
implement circuit breaker
Christinarlong Feb 19, 2026
21d422c
update typing in tests
Christinarlong Feb 19, 2026
417e8fc
Merge branch 'crl/webhook-alarm' into crl/impl-circuit-breaker
Christinarlong Feb 19, 2026
cf8357b
move notifying to after the commit
Christinarlong Feb 19, 2026
16f2113
Merge branch 'crl/webhook-alarm' into crl/impl-circuit-breaker
Christinarlong Feb 19, 2026
1d1b904
add back the is_authenticated guard
Christinarlong Feb 19, 2026
3fa148b
Merge branch 'crl/webhook-alarm' into crl/impl-circuit-breaker
Christinarlong Feb 19, 2026
791699d
add ffs for circuit breaker
Christinarlong Feb 19, 2026
e755d7f
add circuit breaker to webhook sending
Christinarlong Feb 19, 2026
0efb782
change WebhookTimeoutError from BaseException to Exception
Christinarlong Feb 20, 2026
75f1fcf
Merge branch 'crl/webhook-alarm' into crl/impl-circuit-breaker
Christinarlong Feb 20, 2026
9965fa0
Merge branch 'crl/impl-circuit-breaker' into crl/add-ff-to-breaker
Christinarlong Feb 20, 2026
7e4e58b
Merge branch 'crl/add-ff-to-breaker' into crl/add-breaker-to-webhook-…
Christinarlong Feb 20, 2026
7f11bed
Merge remote-tracking branch 'origin/master' into crl/webhook-alarm
Christinarlong Feb 20, 2026
3acaeda
Merge branch 'crl/webhook-alarm' into crl/impl-circuit-breaker
Christinarlong Feb 20, 2026
02bd807
Merge branch 'crl/impl-circuit-breaker' into crl/add-ff-to-breaker
Christinarlong Feb 20, 2026
ff24594
Merge branch 'crl/add-ff-to-breaker' into crl/add-breaker-to-webhook-…
Christinarlong Feb 20, 2026
0c41b3e
merge master into circuit breaker connection
Christinarlong Mar 26, 2026
c492fb5
adjust circuit breaker usage to be the updated one
Christinarlong Mar 27, 2026
94f985d
duplicate enum
Christinarlong Mar 27, 2026
a01f42c
update region silo test to cell silo
Christinarlong Mar 27, 2026
8901b14
update region silo test to cell silo
Christinarlong Mar 27, 2026
07d4036
test name collision
Christinarlong Mar 27, 2026
ab52d21
Merge branch 'master' into crl/add-breaker-to-webhook-sending
Christinarlong Mar 27, 2026
85538e5
Merge branch 'master' into crl/add-breaker-to-webhook-sending
Christinarlong Mar 30, 2026
1126a3d
update codeowners and defautls
Christinarlong Mar 30, 2026
0c02937
break out webhook circuit breaker logic to helpers
Christinarlong Mar 30, 2026
000127d
fix typing and metric key
Christinarlong Mar 30, 2026
26eee2c
circuit breaker comment and test fixes
Christinarlong Mar 30, 2026
3688925
gosh this code is despairge
Christinarlong Mar 31, 2026
c681859
address pr nits
Christinarlong Mar 31, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/CODEOWNERS
Original file line number Diff line number Diff line change
Expand Up @@ -447,6 +447,7 @@ tests/sentry/api/endpoints/test_organization_attribute_mappings.py @get
/src/sentry/sentry_apps/ @getsentry/product-owners-settings-integrations @getsentry/ecosystem
/tests/sentry/sentry_apps/ @getsentry/product-owners-settings-integrations @getsentry/ecosystem
/src/sentry/utils/sentry_apps/ @getsentry/ecosystem
/tests/sentry/utils/sentry_apps/ @getsentry/ecosystem
/src/sentry/middleware/integrations/ @getsentry/ecosystem
/src/sentry/api/endpoints/project_rule*.py @getsentry/alerts-notifications
/src/sentry/api/serializers/models/rule.py @getsentry/alerts-notifications
Expand Down
2 changes: 2 additions & 0 deletions src/sentry/features/temporary.py
Original file line number Diff line number Diff line change
Expand Up @@ -491,6 +491,8 @@ def register_temporary_features(manager: FeatureManager) -> None:
manager.add("organizations:conduit-demo", OrganizationFeature, FeatureHandlerStrategy.FLAGPOLE, api_expose=True)
# Enable hard timeout alarm for webhooks
manager.add("organizations:sentry-app-webhook-hard-timeout", OrganizationFeature, FeatureHandlerStrategy.FLAGPOLE, api_expose=False)
# Enable circuit breaker for webhook endpoint failure detection
manager.add("organizations:sentry-app-webhook-circuit-breaker", OrganizationFeature, FeatureHandlerStrategy.FLAGPOLE, api_expose=False)

# Enables organization access to the new notification platform
manager.add("organizations:notification-platform.internal-testing", OrganizationFeature, FeatureHandlerStrategy.FLAGPOLE, api_expose=True)
Expand Down
23 changes: 23 additions & 0 deletions src/sentry/options/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -2676,6 +2676,29 @@
flags=FLAG_AUTOMATOR_MODIFIABLE,
)

# Circuit breaker configuration for webhook endpoint failure detection.
# Keys match RateBasedTripStrategyConfig + CircuitBreakerConfig
register(
"sentry-apps.webhook.circuit-breaker.config",
type=Dict,
default={
"error_limit_window": 600, # 10 minutes
"broken_state_duration": 300, # 5 minutes
"threshold": 0.5, # 50% error rate
"floor": 500, # 500 errors before error rate check applies
"metrics_key": "sentry-app.webhook", # to avoid high cardinality slug tag
},
flags=FLAG_AUTOMATOR_MODIFIABLE,
)

# When True, the circuit breaker tracks state and emits metrics but does not block requests.
register(
"sentry-apps.webhook.circuit-breaker.dry-run",
Comment thread
Christinarlong marked this conversation as resolved.
type=Bool,
default=False,
flags=FLAG_AUTOMATOR_MODIFIABLE,
)

# Enables statistical detectors for a project
register(
"statistical_detectors.enable",
Expand Down
1 change: 1 addition & 0 deletions src/sentry/sentry_apps/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ class SentryAppWebhookHaltReason(StrEnum):
RESTRICTED_IP = "restricted_ip"
CONNECTION_RESET = "connection_reset"
HARD_TIMEOUT = "hard_timeout"
CIRCUIT_BROKEN = "circuit_broken"


class SentryAppExternalRequestFailureReason(StrEnum):
Expand Down
2 changes: 1 addition & 1 deletion src/sentry/shared_integrations/exceptions/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,6 @@ def __init__(self, field_errors: Mapping[str, Any] | None = None) -> None:
class ClientError(RequestException):
"""4xx Error Occurred"""

def __init__(self, status_code: str, url: str, response: Response | None = None) -> None:
def __init__(self, status_code: str | int, url: str, response: Response | None = None) -> None:
http_error_msg = f"{status_code} Client Error: for url: {url}"
super().__init__(http_error_msg, response=response)
39 changes: 39 additions & 0 deletions src/sentry/utils/sentry_apps/circuit_breaker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import logging
from collections.abc import Generator
from contextlib import contextmanager

from sentry.utils.circuit_breaker2 import CircuitBreaker

logger = logging.getLogger("sentry.sentry_apps.circuit_breaker")


@contextmanager
def circuit_breaker_tracking(
    breaker: CircuitBreaker | None,
) -> Generator[None]:
    """Report the outcome of the wrapped block to ``breaker``.

    - A ``WebhookTimeoutError`` raised by the ``with`` body is reported through
      ``breaker.record_error()`` and then re-raised to the caller.
    - A body that completes without raising is reported through
      ``breaker.record_success()``.
    - Any other exception propagates untouched and is not reported either way.

    Passing ``None`` turns this into a no-op, so callers don't need ``nullcontext()``.
    """
    # NOTE(review): imported at call time rather than module level, presumably
    # because webhooks.py imports this module -- confirm before hoisting.
    from sentry.utils.sentry_apps.webhooks import WebhookTimeoutError

    if breaker is None:
        yield
        return

    try:
        yield
    except WebhookTimeoutError:
        # Only hard timeouts count against the endpoint: those are the requests
        # that take too long. An app answering with, say, a 500 in a reasonable
        # time is not treated as a breaker error.
        try:
            breaker.record_error()
        except Exception:
            # A failure while recording must never replace the timeout the
            # webhook code is expecting to handle, so log it and move on.
            logger.exception("sentry_apps.circuit_breaker.record_error.failure")
        raise

    # Reached only when the body finished without raising.
    try:
        breaker.record_success()
    except Exception:
        # Same reasoning as above: bookkeeping failures stay out of the webhook path.
        logger.exception("sentry_apps.circuit_breaker.record_success.failure")
Comment thread
Christinarlong marked this conversation as resolved.
123 changes: 97 additions & 26 deletions src/sentry/utils/sentry_apps/webhooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from collections.abc import Callable, Mapping
from types import FrameType
from typing import TYPE_CHECKING, Any, Concatenate, ParamSpec, TypeVar
from urllib.parse import urlparse

import sentry_sdk
from requests import RequestException, Response
Expand All @@ -13,6 +14,8 @@
from sentry import features, options
from sentry.exceptions import RestrictedIPAddress
from sentry.http import safe_urlopen
from sentry.integrations.utils.metrics import EventLifecycle
from sentry.organizations.services.organization.model import RpcUserOrganizationContext
from sentry.organizations.services.organization.service import organization_service
from sentry.sentry_apps.metrics import (
SentryAppEventType,
Expand All @@ -23,7 +26,10 @@
from sentry.sentry_apps.utils.errors import SentryAppSentryError
from sentry.shared_integrations.exceptions import ApiHostError, ApiTimeoutError, ClientError
from sentry.taskworker.timeout import timeout_alarm
from sentry.utils import metrics
from sentry.utils.circuit_breaker2 import CircuitBreaker, RateBasedTripStrategy
from sentry.utils.sentry_apps import SentryAppWebhookRequestsBuffer
from sentry.utils.sentry_apps.circuit_breaker import circuit_breaker_tracking

if TYPE_CHECKING:
from sentry.sentry_apps.api.serializers.app_platform_event import AppPlatformEvent
Expand Down Expand Up @@ -51,7 +57,6 @@ def _handle_webhook_timeout(signum: int, frame: FrameType | None) -> None:
"""Handler for when a webhook request exceeds the hard timeout deadline.
- This is a workaround for safe_create_connection sockets hanging when the given url
cannot be reached or resolved.
- TODO(christinarlong): Add sentry app disabling logic here
"""
raise WebhookTimeoutError("Webhook request exceeded hard timeout deadline")

Expand All @@ -73,6 +78,79 @@ def wrapper(
return wrapper


def _create_circuit_breaker(
    sentry_app: SentryApp | RpcSentryApp,
    organization_context: RpcUserOrganizationContext | None,
) -> CircuitBreaker | None:
    """Build the per-app webhook circuit breaker, or return ``None`` when it is disabled.

    ``None`` is returned when there is no organization context, or when the
    ``organizations:sentry-app-webhook-circuit-breaker`` feature is not enabled
    for that context's organization. Otherwise the breaker is keyed by the app's
    slug and configured from the ``sentry-apps.webhook.circuit-breaker.config``
    option, which also supplies the rate-based trip strategy settings.
    """
    if organization_context is None:
        return None

    breaker_enabled = features.has(
        "organizations:sentry-app-webhook-circuit-breaker",
        organization_context.organization,
    )
    if not breaker_enabled:
        return None

    breaker_config = options.get("sentry-apps.webhook.circuit-breaker.config")
    breaker_key = f"sentry-app.webhook.{sentry_app.slug}"
    trip_strategy = RateBasedTripStrategy.from_config(breaker_config)
    return CircuitBreaker(
        key=breaker_key,
        config=breaker_config,
        trip_strategy=trip_strategy,
    )
Comment thread
Christinarlong marked this conversation as resolved.


def _circuit_breaker_allows_request(
    circuit_breaker: CircuitBreaker | None,
    sentry_app: SentryApp | RpcSentryApp,
    org_id: int,
    lifecycle: EventLifecycle,
) -> bool:
    """Decide whether the webhook request may be sent.

    Returns ``True`` when there is no breaker, when the breaker allows the
    request, or when the breaker would block but the
    ``sentry-apps.webhook.circuit-breaker.dry-run`` option is set (in which case
    a ``would_block`` metric and warning are emitted instead of blocking).
    Returns ``False`` after recording a ``CIRCUIT_BROKEN`` halt on ``lifecycle``
    when the request is actually blocked.
    """
    # Without a breaker there is nothing that could block the request.
    if circuit_breaker is None:
        return True
    if circuit_breaker.should_allow_request():
        return True

    # From here on the breaker wants to block; dry-run decides whether we honor it.
    if not options.get("sentry-apps.webhook.circuit-breaker.dry-run"):
        lifecycle.record_halt(
            halt_reason=f"send_and_save_webhook_request.{SentryAppWebhookHaltReason.CIRCUIT_BROKEN}"
        )
        return False

    would_block_event = "sentry_app.webhook.circuit_breaker.would_block"
    app_slug = sentry_app.slug
    metrics.incr(would_block_event, tags={"slug": app_slug})
    logger.warning(would_block_event, extra={"slug": app_slug, "org_id": org_id})
    return True


def _send_webhook_request(
    url: str,
    app_platform_event: AppPlatformEvent[T],
    organization_context: RpcUserOrganizationContext | None,
) -> Response:
    """Send the event's body and headers to ``url`` and return the response.

    The request always uses the ``sentry-apps.webhook.timeout.sec`` option as its
    ``safe_urlopen`` timeout. When an organization context is present and the
    ``organizations:sentry-app-webhook-hard-timeout`` feature is enabled for its
    organization, the request additionally runs under ``timeout_alarm`` with
    ``_handle_webhook_timeout`` as the handler.
    """

    def _perform_request() -> Response:
        # The timeout option is read at call time, i.e. inside the alarm when one is active.
        return safe_urlopen(
            url=url,
            data=app_platform_event.body,
            headers=app_platform_event.headers,
            timeout=options.get("sentry-apps.webhook.timeout.sec"),
        )

    hard_timeout_enabled = organization_context is not None and features.has(
        "organizations:sentry-app-webhook-hard-timeout",
        organization_context.organization,
    )
    if not hard_timeout_enabled:
        return _perform_request()

    # A signal-based timeout is needed to interrupt the blocking socket.connect()
    # call: per SENTRY-5HA6 the request hangs there and the timeout handed to
    # safe_urlopen is not respected.
    hard_timeout_seconds = options.get("sentry-apps.webhook.hard-timeout.sec")
    with timeout_alarm(hard_timeout_seconds, _handle_webhook_timeout):
        return _perform_request()


@sentry_sdk.trace(name="send_and_save_webhook_request")
@ignore_unpublished_app_errors
def send_and_save_webhook_request(
Expand Down Expand Up @@ -124,28 +202,12 @@ def send_and_save_webhook_request(
include_projects=False,
include_teams=False,
)
if organization_context is not None and features.has(
"organizations:sentry-app-webhook-hard-timeout",
organization_context.organization,
):
# We're using a signal based timeout here because we need to interrupt the blocking socket.connect() opeartion.
# See SENTRY-5HA6 for more context. Here we're hanging at the socket.connect() call and the timeout we set
# in safe_urlopen is not being respected.
timeout_seconds = options.get("sentry-apps.webhook.hard-timeout.sec")
with timeout_alarm(timeout_seconds, _handle_webhook_timeout):
response = safe_urlopen(
url=url,
data=app_platform_event.body,
headers=app_platform_event.headers,
timeout=options.get("sentry-apps.webhook.timeout.sec"),
)
else:
response = safe_urlopen(
url=url,
data=app_platform_event.body,
headers=app_platform_event.headers,
timeout=options.get("sentry-apps.webhook.timeout.sec"),
)
circuit_breaker = _create_circuit_breaker(sentry_app, organization_context)
Comment thread
Christinarlong marked this conversation as resolved.
if not _circuit_breaker_allows_request(circuit_breaker, sentry_app, org_id, lifecycle):
return Response()

with circuit_breaker_tracking(circuit_breaker):
response = _send_webhook_request(url, app_platform_event, organization_context)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Circuit breaker errors can crash webhook delivery for published apps

Medium Severity

_create_circuit_breaker and _circuit_breaker_allows_request (which calls should_allow_request()) are invoked inside a try block that only catches specific exception types (WebhookTimeoutError, Timeout, ConnectionError, etc.). An unexpected exception from these — such as a Redis failure inside should_allow_request() → _should_trip() → check_within_quotas(), or a KeyError from a malformed options config — would propagate uncaught, causing the webhook to fail for published apps. This is inconsistent with circuit_breaker_tracking, which carefully wraps record_error() and record_success() in try/except Exception to prevent circuit breaker infrastructure issues from breaking webhook delivery.

Additional Locations (1)
Fix in Cursor Fix in Web

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hmm, we have retries for uncaught errors at this step so idt it's that big of a deal


except WebhookTimeoutError:
lifecycle.record_halt(
Expand Down Expand Up @@ -186,13 +248,19 @@ def send_and_save_webhook_request(
raise

track_response_code(response.status_code, slug, event)

project_id = (
int(p_id)
if (p_id := response.headers.get("Sentry-Hook-Project")) and p_id.isdigit()
else None
)
buffer.add_request(
response_code=response.status_code,
org_id=org_id,
event=event,
url=url,
error_id=response.headers.get("Sentry-Hook-Error"),
project_id=response.headers.get("Sentry-Hook-Project"),
project_id=project_id,
response=response,
headers=app_platform_event.headers,
)
Expand Down Expand Up @@ -223,13 +291,15 @@ def send_and_save_webhook_request(
lifecycle.record_halt(
halt_reason=f"send_and_save_webhook_request.{SentryAppWebhookHaltReason.INTEGRATOR_ERROR}"
)
raise ApiHostError.from_request(response.request)
raise ApiHostError(f"Unable to reach host: {urlparse(url).netloc}", url=url)

elif response.status_code == status.HTTP_504_GATEWAY_TIMEOUT:
lifecycle.record_halt(
halt_reason=f"send_and_save_webhook_request.{SentryAppWebhookHaltReason.INTEGRATOR_ERROR}"
)
raise ApiTimeoutError.from_request(response.request)
raise ApiTimeoutError(
f"Timed out attempting to reach host: {urlparse(url).netloc}", url=url
)

elif 400 <= response.status_code < 500:
lifecycle.record_halt(
Expand All @@ -243,4 +313,5 @@ def send_and_save_webhook_request(
except RequestException as e:
lifecycle.record_halt(e)
raise

return response
2 changes: 1 addition & 1 deletion tests/sentry/sentry_apps/tasks/test_sentry_apps.py
Original file line number Diff line number Diff line change
Expand Up @@ -1713,7 +1713,7 @@ def test_saves_error_event_id_if_in_header(self, safe_urlopen: MagicMock) -> Non
assert first_request["event_type"] == "issue.assigned"
assert first_request["organization_id"] == self.install.organization_id
assert first_request["error_id"] == "d5111da2c28645c5889d072017e3445d"
assert first_request["project_id"] == "1"
assert first_request["project_id"] == 1


@patch("sentry.utils.sentry_apps.webhooks.safe_urlopen", return_value=MockResponseInstance)
Expand Down
Empty file.
Loading
Loading