|
| 1 | +import logging |
| 2 | +from collections.abc import Mapping |
| 3 | +from datetime import timedelta |
| 4 | +from functools import cache |
| 5 | + |
| 6 | +from django.db import router, transaction |
| 7 | +from rest_framework import status |
| 8 | + |
| 9 | +from sentry import features |
| 10 | +from sentry.api.exceptions import SentryAPIException |
| 11 | +from sentry.grouping.grouptype import ErrorGroupType |
| 12 | +from sentry.incidents.grouptype import MetricIssue |
| 13 | +from sentry.incidents.models.alert_rule import AlertRuleDetectionType |
| 14 | +from sentry.incidents.utils.constants import INCIDENTS_SNUBA_SUBSCRIPTION_TYPE |
| 15 | +from sentry.incidents.utils.types import DATA_SOURCE_SNUBA_QUERY_SUBSCRIPTION |
| 16 | +from sentry.issue_detection.performance_detection import PERFORMANCE_DETECTOR_CONFIG_MAPPINGS |
| 17 | +from sentry.issues import grouptype |
| 18 | +from sentry.locks import locks |
| 19 | +from sentry.models.project import Project |
| 20 | +from sentry.projectoptions.defaults import DEFAULT_PROJECT_PERFORMANCE_DETECTION_SETTINGS |
| 21 | +from sentry.seer.anomaly_detection.store_data_workflow_engine import send_new_detector_data |
| 22 | +from sentry.seer.anomaly_detection.types import ( |
| 23 | + AnomalyDetectionSeasonality, |
| 24 | + AnomalyDetectionSensitivity, |
| 25 | + AnomalyDetectionThresholdType, |
| 26 | +) |
| 27 | +from sentry.snuba.dataset import Dataset |
| 28 | +from sentry.snuba.models import SnubaQuery, SnubaQueryEventType |
| 29 | +from sentry.snuba.subscriptions import create_snuba_query, create_snuba_subscription |
| 30 | +from sentry.utils.locking import UnableToAcquireLock |
| 31 | +from sentry.workflow_engine.models import ( |
| 32 | + DataCondition, |
| 33 | + DataConditionGroup, |
| 34 | + DataSource, |
| 35 | + DataSourceDetector, |
| 36 | + Detector, |
| 37 | +) |
| 38 | +from sentry.workflow_engine.models.data_condition import Condition |
| 39 | +from sentry.workflow_engine.types import ( |
| 40 | + ERROR_DETECTOR_NAME, |
| 41 | + ISSUE_STREAM_DETECTOR_NAME, |
| 42 | + DetectorPriorityLevel, |
| 43 | +) |
| 44 | +from sentry.workflow_engine.typings.grouptype import IssueStreamGroupType |
| 45 | + |
# Detector types (group-type slugs) that may be auto-created per project:
# the error detector, the issue-stream detector, and every performance detector.
_PERFORMANCE_DETECTOR_TYPES = [
    mapping.wfe_detector_type for mapping in PERFORMANCE_DETECTOR_CONFIG_MAPPINGS.values()
]
VALID_DEFAULT_DETECTOR_TYPES = [
    ErrorGroupType.slug,
    IssueStreamGroupType.slug,
    *_PERFORMANCE_DETECTOR_TYPES,
]

logger = logging.getLogger(__name__)
| 53 | + |
| 54 | + |
@cache
def get_disabled_platforms_by_detector_type() -> Mapping[str, frozenset[str]]:
    """
    Map WFE detector types to platforms where they should be disabled by default.

    Derived from DEFAULT_DETECTOR_DISABLING_CONFIGS by matching each config's
    detector_project_option against a mapping's detection_enabled_key. Cached
    for the process lifetime since both inputs are module-level constants.
    """
    # Imported locally to avoid a circular import at module load time.
    from sentry.issue_detection.detectors.disable_detectors import (
        DEFAULT_DETECTOR_DISABLING_CONFIGS,
    )

    mappings = list(PERFORMANCE_DETECTOR_CONFIG_MAPPINGS.values())
    result: dict[str, frozenset[str]] = {}

    for config in DEFAULT_DETECTOR_DISABLING_CONFIGS:
        option_key = config["detector_project_option"]
        # First mapping whose detection_enabled_key matches this config, if any.
        matched = next(
            (m for m in mappings if m.detection_enabled_key == option_key),
            None,
        )
        if matched is not None:
            result[matched.wfe_detector_type] = frozenset(config["languages_to_disable"])

    return result
| 80 | + |
| 81 | + |
class UnableToAcquireLockApiError(SentryAPIException):
    """Surfaced as a 400 response when the detector-creation lock cannot be acquired."""

    message = "Unable to acquire lock for issue alert migration."
    code = "unable_to_acquire_lock"
    status_code = status.HTTP_400_BAD_REQUEST
| 86 | + |
| 87 | + |
def _ensure_detector(project: Project, type: str, default_enabled: bool = True) -> Detector:
    """
    Ensure that a detector of a given type exists for a project, returning the
    existing one or creating it with a default name/config.

    If the Detector doesn't already exist, we try to acquire a lock to avoid
    double-creating, raising UnableToAcquireLockApiError if that fails.

    Raises ValueError if ``type`` is not a registered group type, or not one of
    VALID_DEFAULT_DETECTOR_TYPES.
    """
    group_type = grouptype.registry.get_by_slug(type)
    if not group_type:
        raise ValueError(f"Group type {type} not registered")
    slug = group_type.slug
    if slug not in VALID_DEFAULT_DETECTOR_TYPES:
        raise ValueError(f"Invalid default detector type: {slug}")

    # If it already exists, life is simple and we can return immediately.
    # If there happen to be duplicates, we prefer the oldest.
    existing = Detector.objects.filter(type=slug, project=project).order_by("id").first()
    if existing:
        return existing

    # If we may need to create it, we acquire a lock to avoid double-creating.
    # There isn't a unique constraint on the detector, so we can't rely on get_or_create
    # to avoid duplicates.
    # However, by only locking during the one-time creation, the window for a race condition is small.
    lock = locks.get(
        f"workflow-engine-project-{slug}-detector:{project.id}",
        duration=2,
        name=f"workflow_engine_default_{slug}_detector",
    )
    try:
        with (
            # Creation should be fast, so it's worth blocking a little rather
            # than failing a request.
            lock.blocking_acquire(initial_delay=0.1, timeout=3),
            transaction.atomic(router.db_for_write(Detector)),
        ):
            detector, _ = Detector.objects.get_or_create(
                type=slug,
                project=project,
                defaults={
                    "config": {},
                    # Error and issue-stream detectors have dedicated display names;
                    # everything else falls back to the group type's description.
                    "name": (
                        ERROR_DETECTOR_NAME
                        if slug == ErrorGroupType.slug
                        else ISSUE_STREAM_DETECTOR_NAME
                        if slug == IssueStreamGroupType.slug
                        else group_type.description
                    ),
                    "enabled": default_enabled,
                },
            )
            return detector
    except UnableToAcquireLock as err:
        # Chain the lock failure so the original cause isn't lost in API error reports.
        raise UnableToAcquireLockApiError from err
| 141 | + |
| 142 | + |
def _get_existing_metric_detector(project: Project) -> Detector | None:
    # Oldest metric-issue detector for the project, or None; preferring the
    # oldest keeps behavior stable if duplicates somehow exist.
    return Detector.objects.filter(type=MetricIssue.slug, project=project).order_by("id").first()


def _create_anomaly_detector_models(
    project: Project, owner_team_id: int | None, enabled: bool
) -> Detector:
    """
    Create all workflow-engine models backing the default anomaly detector:
    condition group + anomaly condition, the Detector itself, the Snuba
    query/subscription, and the DataSource linking them.

    Must be called inside a transaction so a partial failure rolls back cleanly.
    """
    condition_group = DataConditionGroup.objects.create(
        logic_type=DataConditionGroup.Type.ANY,
        organization_id=project.organization_id,
    )

    DataCondition.objects.create(
        comparison={
            "sensitivity": AnomalyDetectionSensitivity.LOW,
            "seasonality": AnomalyDetectionSeasonality.AUTO,
            "threshold_type": AnomalyDetectionThresholdType.ABOVE,
        },
        condition_result=DetectorPriorityLevel.HIGH,
        type=Condition.ANOMALY_DETECTION,
        condition_group=condition_group,
    )

    detector = Detector.objects.create(
        project=project,
        name="High Error Count (Default)",
        description="Automatically monitors for anomalous spikes in error count",
        workflow_condition_group=condition_group,
        type=MetricIssue.slug,
        config={
            "detection_type": AlertRuleDetectionType.DYNAMIC.value,
            "comparison_delta": None,
        },
        owner_team_id=owner_team_id,
        enabled=enabled,
    )

    # Count of all errors over a 15-minute window, evaluated every 15 minutes.
    snuba_query = create_snuba_query(
        query_type=SnubaQuery.Type.ERROR,
        dataset=Dataset.Events,
        query="",
        aggregate="count()",
        time_window=timedelta(minutes=15),
        resolution=timedelta(minutes=15),
        environment=None,
        event_types=[SnubaQueryEventType.EventType.ERROR],
    )

    query_subscription = create_snuba_subscription(
        project=project,
        subscription_type=INCIDENTS_SNUBA_SUBSCRIPTION_TYPE,
        snuba_query=snuba_query,
    )

    data_source = DataSource.objects.create(
        organization_id=project.organization_id,
        source_id=str(query_subscription.id),
        type=DATA_SOURCE_SNUBA_QUERY_SUBSCRIPTION,
    )

    DataSourceDetector.objects.create(
        data_source=data_source,
        detector=detector,
    )

    return detector


def ensure_default_anomaly_detector(
    project: Project, owner_team_id: int | None = None, enabled: bool = True
) -> Detector:
    """
    Ensure that a default anomaly detection metric monitor exists for a project,
    returning the existing or newly created Detector.

    If the Detector doesn't already exist, we try to acquire a lock to avoid
    double-creating, raising UnableToAcquireLockApiError if that fails. Any
    failure creating the models or registering with Seer rolls back the whole
    transaction and re-raises.
    """
    # If it already exists, return immediately. Prefer the oldest if duplicates exist.
    existing = _get_existing_metric_detector(project)
    if existing:
        logger.info(
            "create_default_anomaly_detector.already_exists",
            extra={"project_id": project.id, "detector_id": existing.id},
        )
        return existing

    lock = locks.get(
        f"workflow-engine-project-{MetricIssue.slug}-detector:{project.id}",
        duration=2,
        name=f"workflow_engine_default_{MetricIssue.slug}_detector",
    )
    try:
        with (
            lock.blocking_acquire(initial_delay=0.1, timeout=3),
            transaction.atomic(router.db_for_write(Detector)),
        ):
            # Double-check after acquiring lock in case another process created it
            existing = _get_existing_metric_detector(project)
            if existing:
                return existing

            try:
                detector = _create_anomaly_detector_models(project, owner_team_id, enabled)
            except Exception:
                logger.exception(
                    "create_default_anomaly_detector.create_models_failed",
                    extra={"project_id": project.id, "organization_id": project.organization_id},
                )
                raise

            # Register the detector with Seer; raising here rolls back the
            # transaction so we never keep a detector Seer doesn't know about.
            try:
                send_new_detector_data(detector)
            except Exception:
                logger.exception(
                    "create_default_anomaly_detector.send_to_seer_failed",
                    extra={"project_id": project.id, "organization_id": project.organization_id},
                )
                raise

            return detector
    except UnableToAcquireLock as err:
        # Chain the lock failure so the original cause isn't lost in API error reports.
        raise UnableToAcquireLockApiError from err
| 257 | + |
| 258 | + |
def ensure_performance_detectors(project: Project) -> dict[str, Detector]:
    """
    Ensure every performance detector exists for the project, keyed by detector type.

    Returns an empty dict when the workflow-engine performance-detectors feature
    is not enabled for the project. Each detector's initial enabled state honors
    both the project's platform blocklist and the default detection settings.
    """
    if not features.has("projects:workflow-engine-performance-detectors", project):
        return {}

    platform_blocklist = get_disabled_platforms_by_detector_type()

    detectors: dict[str, Detector] = {}
    for config_mapping in PERFORMANCE_DETECTOR_CONFIG_MAPPINGS.values():
        detector_type = config_mapping.wfe_detector_type

        # Disabled if the project's platform is blocklisted for this detector,
        # otherwise fall back to the detector's default setting.
        blocked = project.platform in platform_blocklist.get(detector_type, frozenset())
        initial_enabled = (not blocked) and DEFAULT_PROJECT_PERFORMANCE_DETECTION_SETTINGS[
            config_mapping.detection_enabled_key
        ]

        detectors[detector_type] = _ensure_detector(
            project, detector_type, default_enabled=initial_enabled
        )

    return detectors
| 279 | + |
| 280 | + |
def ensure_default_detectors(project: Project) -> dict[str, Detector]:
    """Ensure the error, issue-stream, and performance detectors exist for the project."""
    detectors: dict[str, Detector] = {
        slug: _ensure_detector(project, slug)
        for slug in (ErrorGroupType.slug, IssueStreamGroupType.slug)
    }
    detectors.update(ensure_performance_detectors(project))
    return detectors
0 commit comments