Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/CODEOWNERS
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
/src/sentry/projectoptions/ @getsentry/app-backend
/src/sentry/receivers/ @getsentry/app-backend
/src/sentry/ratelimits/ @getsentry/app-backend
/src/sentry/asgi.py @getsentry/app-backend

/tests/js/sentry-test/ @getsentry/app-frontend
/static/app/utils/ @getsentry/app-frontend
Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,14 +37,15 @@ dependencies = [
"google-cloud-storage-transfer>=1.17.0",
"google-crc32c>=1.6.0",
"googleapis-common-protos>=1.63.2",
"granian[pname,reload]>=2.7",
"granian[pname,reload,uvloop]>=2.7",
"grpc-google-iam-v1>=0.13.1",
# Note, grpcio>1.30.0 requires setting GRPC_POLL_STRATEGY=epoll1
# See https://github.com/grpc/grpc/issues/23796 and
# https://github.com/grpc/grpc/blob/v1.35.x/doc/core/grpc-polling-engines.md#polling-engine-implementations-in-grpc
"grpcio>=1.67.0",
# not directly used, but provides a speedup for redis
"hiredis>=2.3.2",
"httpx>=0.28.1",
"jsonschema>=4.20.0",
"lxml>=5.3.0",
"maxminddb>=2.3.0",
Expand Down
18 changes: 18 additions & 0 deletions src/sentry/asgi.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
"""ASGI entrypoint for Sentry.

Exposes ``application`` for ASGI servers (e.g. granian, per pyproject).
Statement order here is deliberate: the path tweak and settings
configuration must happen before Django's handler machinery is imported.
"""
import os.path
import sys

# Add the project to the python path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), os.pardir))

# Configure the application only if it seemingly isn't already configured
from django.conf import settings

if not settings.configured:
    from sentry.runner import configure

    configure()

# NOTE: imported only after configuration on purpose -- Django settings must
# be ready before the ASGI handler is constructed.
from django.core.handlers.asgi import ASGIHandler

# Run ASGI handler for the application
application = ASGIHandler()
11 changes: 10 additions & 1 deletion src/sentry/conf/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -371,6 +371,12 @@ def env(
# so that responses aren't modified after Content-Length is set, or have the
# response modifying middleware reset the Content-Length header.
# This is because CommonMiddleware Sets the Content-Length header for non-streaming responses.
# Opt-in switch for the async API Gateway implementation; any of the usual
# truthy spellings in SENTRY_APIGW_ASYNC enables it.
_apigw_async_flag = os.environ.get("SENTRY_APIGW_ASYNC", "").lower()
APIGW_ASYNC = _apigw_async_flag in ("1", "true", "y", "yes")
if APIGW_ASYNC:
    APIGW_MIDDLEWARE = "sentry.hybridcloud.apigateway_async.middleware.ApiGatewayMiddleware"
else:
    APIGW_MIDDLEWARE = "sentry.hybridcloud.apigateway.middleware.ApiGatewayMiddleware"
MIDDLEWARE: tuple[str, ...] = (
"csp.middleware.CSPMiddleware",
"sentry.middleware.health.HealthCheck",
Expand All @@ -387,7 +393,7 @@ def env(
"sentry.middleware.auth.AuthenticationMiddleware",
"sentry.middleware.ai_agent.AIAgentMiddleware",
"sentry.middleware.integrations.IntegrationControlMiddleware",
"sentry.hybridcloud.apigateway.middleware.ApiGatewayMiddleware",
APIGW_MIDDLEWARE,
"sentry.middleware.demo_mode_guard.DemoModeGuardMiddleware",
"sentry.middleware.customer_domain.CustomerDomainMiddleware",
"sentry.middleware.sudo.SudoMiddleware",
Expand Down Expand Up @@ -3181,6 +3187,9 @@ def custom_parameter_sort(parameter: dict) -> tuple[str, int]:
}
# Used in tests to skip forwarding relay paths to a region silo that does not exist.
APIGATEWAY_PROXY_SKIP_RELAY = False
# Tuning knobs for the async API gateway proxy circuit breaker, overridable
# via the environment: per-worker concurrency cap, max failures tolerated
# inside one rolling window, and the window length in seconds.
APIGATEWAY_PROXY_MAX_CONCURRENCY = int(os.environ.get("SENTRY_APIGW_PROXY_MAX_CONCURRENCY", "100"))
APIGATEWAY_PROXY_MAX_FAILURES = int(os.environ.get("SENTRY_APIGW_PROXY_MAX_FAILURES", "100"))
APIGATEWAY_PROXY_FAILURE_WINDOW = int(os.environ.get("SENTRY_APIGW_PROXY_FAILURE_WINDOW", "60"))

# Shared resource ids for accounting
EVENT_PROCESSING_STORE = "rc_processing_redis"
Expand Down
3 changes: 3 additions & 0 deletions src/sentry/hybridcloud/apigateway_async/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Public API of the async API gateway package: only the main proxy entry
# point is exported; proxy/circuit-breaker internals stay private.
from .apigateway import proxy_request_if_needed

__all__ = ("proxy_request_if_needed",)
108 changes: 108 additions & 0 deletions src/sentry/hybridcloud/apigateway_async/apigateway.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
from __future__ import annotations

import logging
from collections.abc import Callable
from typing import Any

from django.conf import settings
from django.http.response import HttpResponseBase
from rest_framework.request import Request

from sentry.silo.base import SiloLimit, SiloMode
from sentry.types.cell import get_cell_by_name
from sentry.utils import metrics

from .proxy import (
proxy_cell_request,
proxy_error_embed_request,
proxy_request,
)

logger = logging.getLogger(__name__)


def _get_view_silo_mode(view_func: Callable[..., HttpResponseBase]) -> frozenset[SiloMode] | None:
view_class = getattr(view_func, "view_class", None)
if not view_class:
return None
if not hasattr(view_class, "silo_limit"):
return None
endpoint_silo_limit: SiloLimit = view_class.silo_limit
return endpoint_silo_limit.modes


async def proxy_request_if_needed(
    request: Request,
    view_func: Callable[..., HttpResponseBase],
    view_kwargs: dict[str, Any],
) -> HttpResponseBase | None:
    """
    Main execution flow for the API Gateway.
    returns None if proxying is not required, or a response if the proxy was successful.
    """
    # Only the control silo ever proxies traffic onward.
    mode = SiloMode.get_current_mode()
    if mode != SiloMode.CONTROL:
        return None

    # If the view is mode-agnostic, or can run in our mode, serve it locally.
    target_modes = _get_view_silo_mode(view_func)
    if not target_modes or mode in target_modes:
        return None

    resolver = request.resolver_match
    url_name = (resolver.url_name if resolver else None) or "unknown"

    if "organization_slug" in view_kwargs or "organization_id_or_slug" in view_kwargs:
        slug = view_kwargs.get("organization_slug") or view_kwargs.get(
            "organization_id_or_slug", ""
        )
        metrics.incr(
            "apigateway.proxy_request",
            tags={
                "url_name": url_name,
                "kind": "orgslug",
            },
        )
        return await proxy_request(request, str(slug), url_name)

    if url_name == "sentry-error-page-embed" and "dsn" in request.GET:
        # Error embed modal is special as customers can't easily use cell URLs.
        metrics.incr(
            "apigateway.proxy_request",
            tags={
                "url_name": url_name,
                "kind": "error-embed",
            },
        )
        return await proxy_error_embed_request(request, request.GET["dsn"], url_name)

    if resolver and resolver.url_name in settings.REGION_PINNED_URL_NAMES:
        # Pinned URLs are always routed to the monolith region's cell.
        cell = get_cell_by_name(settings.SENTRY_MONOLITH_REGION)
        metrics.incr(
            "apigateway.proxy_request",
            tags={
                "url_name": url_name,
                "kind": "regionpin",
            },
        )
        return await proxy_cell_request(request, cell, url_name)

    if url_name != "unknown":
        # If we know the URL but didn't proxy it record we could be missing
        # URL handling and that needs to be fixed.
        metrics.incr(
            "apigateway.proxy_request",
            tags={
                "kind": "noop",
                "url_name": url_name,
            },
        )
        logger.info("apigateway.unknown_url", extra={"url": request.path})

    return None
104 changes: 104 additions & 0 deletions src/sentry/hybridcloud/apigateway_async/circuitbreaker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
from __future__ import annotations

import asyncio
import time
from collections import defaultdict
from typing import Any

from django.conf import settings


class CircuitBreaker:
    """Per-process circuit breaker for proxied requests.

    Combines two guards:
    * a concurrency cap enforced by an ``asyncio.Semaphore``
    * a failure budget counted over a rolling time window, implemented with
      two alternating counters that flip every ``counter_window`` seconds

    State is local to a single worker process by design (see PR discussion):
    a shared breaker would let one misbehaving pod trip the circuit for the
    whole deployment.
    """

    __slots__ = [
        "concurrency",
        "counter_window",
        "failures",
        "semaphore",
        "_clock",
        "_counters",
        "_counter_idx",
    ]

    def __init__(self, concurrency: int, failures: tuple[int, int]) -> None:
        # ``failures`` is a (window_seconds, max_failures) pair.
        self.concurrency = concurrency
        self.counter_window = failures[0]
        self.failures = failures[1]
        self.semaphore = asyncio.Semaphore(self.concurrency)
        self._clock = 0
        self._counters = [0, 0]
        self._counter_idx = 0

    def _counter_flip(self, clock: int) -> None:
        # Switch to the other counter and zero the one we just left.
        self._clock = clock
        prev = self._counter_idx
        self._counter_idx = 1 - prev
        self._counters[prev] = 0

    def _maybe_counter_flip(self) -> None:
        # Flip the active counter once at least one full window has elapsed.
        now = int(time.monotonic())
        delta = now - self._clock
        if delta > 0:
            if delta // self.counter_window:
                self._counter_flip(now)

    def counter_incr(self) -> None:
        """Record one failure in the current window."""
        self._maybe_counter_flip()
        self._counters[self._counter_idx] += 1

    def window_overflow(self) -> bool:
        """Return True when the failure budget for this window is exhausted.

        Uses ``>=`` so ``failures`` is an inclusive upper bound: with a max
        of N the breaker trips once the Nth failure is recorded (previously
        ``>`` allowed N+1 failures, as flagged in review).
        """
        self._maybe_counter_flip()
        return self._counters[self._counter_idx] >= self.failures

    def overflow(self) -> bool:
        """Return True when every concurrency slot is currently in use."""
        return self.semaphore.locked()

    def ctx(self) -> CircuitBreakerCtx:
        """Create a context manager guarding one request with this breaker."""
        return CircuitBreakerCtx(self)


class CircuitBreakerOverflow(Exception):
    """Raised when no concurrency slot is available (semaphore exhausted)."""


class CircuitBreakerWindowOverflow(Exception):
    """Raised when the failure budget of the current window is exhausted."""


class CircuitBreakerCtx:
    """Async context manager guarding one proxied request with a breaker.

    Entering acquires a concurrency slot (or raises); exiting always
    releases it. Failures are recorded explicitly via ``incr_failures``.
    """

    __slots__ = ["cb"]

    def __init__(self, cb: CircuitBreaker):
        self.cb = cb

    def incr_failures(self) -> None:
        # Record one failure in the breaker's rolling window.
        self.cb.counter_incr()

    async def __aenter__(self) -> CircuitBreakerCtx:
        breaker = self.cb
        # Reject immediately when every concurrency slot is taken.
        if breaker.overflow():
            raise CircuitBreakerOverflow
        await breaker.semaphore.acquire()
        # Reject -- and give the slot back -- when too many recent failures.
        if breaker.window_overflow():
            breaker.semaphore.release()
            raise CircuitBreakerWindowOverflow
        return self

    async def __aexit__(self, exc_type: Any, exc_value: Any, exc_tb: Any) -> None:
        self.cb.semaphore.release()


class CircuitBreakerManager:
    """Lazily creates one CircuitBreaker per key (e.g. per upstream cell).

    Limits default to the ``APIGATEWAY_PROXY_*`` Django settings; explicit
    constructor arguments take precedence.
    """

    __slots__ = ["objs"]

    def __init__(
        self,
        max_concurrency: int | None = None,
        failures: int | None = None,
        failure_window: int | None = None,
    ):
        # Use `is None` checks rather than `or` so an explicit 0 is honored
        # instead of being silently replaced by the settings default.
        concurrency = (
            settings.APIGATEWAY_PROXY_MAX_CONCURRENCY if max_concurrency is None else max_concurrency
        )
        max_failures = settings.APIGATEWAY_PROXY_MAX_FAILURES if failures is None else failures
        window = (
            settings.APIGATEWAY_PROXY_FAILURE_WINDOW if failure_window is None else failure_window
        )
        # Breakers are created on first access for each key.
        self.objs: dict[str, CircuitBreaker] = defaultdict(
            lambda: CircuitBreaker(concurrency, (window, max_failures))
        )

    def get(self, key: str) -> CircuitBreakerCtx:
        """Return a fresh context manager bound to the breaker for *key*."""
        return self.objs[key].ctx()
78 changes: 78 additions & 0 deletions src/sentry/hybridcloud/apigateway_async/middleware.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
from __future__ import annotations

import asyncio
from collections.abc import Callable
from typing import Any

from asgiref.sync import async_to_sync, iscoroutinefunction, markcoroutinefunction
from django.http.response import HttpResponseBase
from rest_framework.request import Request

from . import proxy_request_if_needed


class ApiGatewayMiddleware:
    """Proxy requests intended for remote silos.

    Works under both sync (WSGI) and async (ASGI) stacks: when the
    downstream handler is a coroutine function, the middleware marks itself
    as coroutine-based so Django keeps the request on the async path.
    """

    async_capable = True
    sync_capable = True

    def __init__(self, get_response: Callable[[Request], HttpResponseBase]):
        self.get_response = get_response
        if iscoroutinefunction(self.get_response):
            markcoroutinefunction(self)

    def __call__(self, request: Request) -> Any:
        # Dispatch through the coroutine path when Django treats us as async.
        if iscoroutinefunction(self):
            return self.__acall__(request)
        return self.get_response(request)

    async def __acall__(self, request: Request) -> HttpResponseBase:
        return await self.get_response(request)  # type: ignore[misc]

    def process_view(
        self,
        request: Request,
        view_func: Callable[..., HttpResponseBase],
        view_args: tuple[str],
        view_kwargs: dict[str, Any],
    ) -> HttpResponseBase | None:
        return self._process_view_match(request, view_func, view_args, view_kwargs)

    def _process_view_match(
        self,
        request: Request,
        view_func: Callable[..., HttpResponseBase],
        view_args: tuple[str],
        view_kwargs: dict[str, Any],
    ) -> Any:
        # Probe the runtime exactly once, then cache the matching
        # implementation on the instance so later calls skip the probe.
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            impl = self._process_view_sync
        else:
            impl = self._process_view_inner
        setattr(self, "_process_view_match", impl)
        return impl(request, view_func, view_args, view_kwargs)

    def _process_view_sync(
        self,
        request: Request,
        view_func: Callable[..., HttpResponseBase],
        view_args: tuple[str],
        view_kwargs: dict[str, Any],
    ) -> HttpResponseBase | None:
        # Bridge from the sync request path into the async implementation.
        return async_to_sync(self._process_view_inner)(request, view_func, view_args, view_kwargs)

    async def _process_view_inner(
        self,
        request: Request,
        view_func: Callable[..., HttpResponseBase],
        view_args: tuple[str],
        view_kwargs: dict[str, Any],
    ) -> HttpResponseBase | None:
        # None means "not proxied": let normal view dispatch continue.
        return await proxy_request_if_needed(request, view_func, view_kwargs)
Loading
Loading