feat(snuba): add a metric in query_trace_data to see which spans report span.status "ok" but have an associated error (#112090)

constantinius · george-sentry · commit f824fd3983ef · 2026-04-09T14:13:01.000-07:00
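Emit a metric when we detect a span with status "ok" that has associated errors. To support this, the trace query additionally fetches the "span.status", "origin", and "sdk.version" attributes: "span.status" is used to detect the inconsistency, while "origin" and "sdk.version" are used as tags on the metric. Any of these attributes that the caller did not request are removed from the spans before the response is built. Contributes to https://linear.app/getsentry/issue/TET-2102/detect-spanstatus-inconsistencies-and-report-to-sdk-teams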
diff --git a/src/sentry/snuba/trace.py b/src/sentry/snuba/trace.py
@@ -5,6 +5,7 @@
 from typing import Any, Literal, NotRequired, TypedDict
 
 from sentry.uptime.subscriptions.regions import get_region_config
+from sentry.utils import metrics
 from sentry.utils.concurrent import ContextPropagatingThreadPoolExecutor
 
 logger = logging.getLogger(__name__)
@@ -634,6 +635,11 @@ def query_trace_data(
     # the thread pool, database connections can hang around as the threads are not cleaned
     # up. Because of that, tests can fail during tear down as there are active connections
     # to the database preventing a DROP.
+    metric_attributes = {"span.status", "origin", "sdk.version"}
+    all_additional_attributes = list({*(additional_attributes or []), *metric_attributes})
+    # Attributes added only for metric tagging that should not appear in the response
+    metric_only_attributes = metric_attributes - set(additional_attributes or [])
+
     errors_query = _errors_query(snuba_params, trace_id, error_id)
     occurrence_query = _perf_issues_query(snuba_params, trace_id, organization)
     uptime_query = _uptime_results_query(snuba_params, trace_id) if include_uptime else None
@@ -650,7 +656,7 @@ def query_trace_data(
             params=snuba_params,
             referrer=referrer.value,
             config=SearchResolverConfig(),
-            additional_attributes=additional_attributes,
+            additional_attributes=all_additional_attributes,
         )
         errors_future = query_thread_pool.submit(
             _run_errors_query,
@@ -775,6 +781,18 @@ def query_trace_data(
             if span["id"] in id_to_error:
                 errors = id_to_error.pop(span["id"])
                 span["errors"].extend(errors)
+                if span.get("span.status", "") == "ok":
+                    metrics.incr(
+                        "performance.trace.span_with_errors_ok_status",
+                        sample_rate=0.01,
+                        tags={
+                            "sdk_name": span.get("sdk.name", ""),
+                            "sdk_version": span.get("sdk.version", ""),
+                            "origin": span.get("origin", ""),
+                            "project_id": str(span.get("project.id", "")),
+                            "project_slug": span.get("project.slug", ""),
+                        },
+                    )
             if span["id"] in id_to_occurrence:
                 occurrences: list[TraceOccurrenceEvent] = [
                     {
@@ -803,6 +821,11 @@ def query_trace_data(
                 snuba_params.end_date.timestamp() - span_max_ts,
             )
 
+    if metric_only_attributes:
+        for span in spans_data:
+            for attr in metric_only_attributes:
+                span.pop(attr, None)
+
     with sentry_sdk.start_span(op="process.errors_data"):
         for errors in id_to_error.values():
             result.extend(errors)
diff --git a/tests/snuba/api/endpoints/test_organization_trace.py b/tests/snuba/api/endpoints/test_organization_trace.py
@@ -18,6 +18,7 @@
 from sentry.issues.issue_occurrence import IssueOccurrence
 from sentry.models.group import Group
 from sentry.search.events.types import SnubaParams
+from sentry.snuba.spans_rpc import Spans
 from sentry.snuba.trace import _run_errors_query_eap
 from sentry.testutils.cases import OccurrenceTestCase, SnubaTestCase, UptimeResultEAPTestCase
 from sentry.testutils.helpers.datetime import before_now
@@ -414,6 +415,47 @@ def test_with_errors_data_with_overlapping_span_id(self) -> None:
         assert error_event_2["event_id"] in [error.event_id, error_2.event_id]
         assert error_event_1["event_id"] != error_event_2["event_id"]
 
+    @mock.patch("sentry.snuba.trace.metrics")
+    def test_emits_metric_for_error_on_ok_span(self, mock_metrics) -> None:
+        self.load_trace()
+        _, start = self.get_start_end_from_day_ago(123)
+        root_span_id = self.root_event.data["contexts"]["trace"]["span_id"]
+        error_data = load_data(
+            "javascript",
+            timestamp=start,
+        )
+        error_data["contexts"]["trace"] = {
+            "type": "trace",
+            "trace_id": self.trace_id,
+            "span_id": root_span_id,
+        }
+        error_data["tags"] = [["transaction", "/transaction/gen1-0"]]
+        self.store_event(error_data, project_id=self.gen1_project.id)
+
+        original_run_trace_query = Spans.run_trace_query
+
+        def patched_run_trace_query(**kwargs):
+            spans = original_run_trace_query(**kwargs)
+            for span in spans:
+                if span["id"] == root_span_id:
+                    span["span.status"] = "ok"
+            return spans
+
+        with (
+            self.feature(self.FEATURES),
+            mock.patch.object(Spans, "run_trace_query", side_effect=patched_run_trace_query),
+        ):
+            response = self.client_get(
+                data={"timestamp": self.day_ago},
+            )
+        assert response.status_code == 200, response.content
+
+        mock_metrics.incr.assert_any_call(
+            "performance.trace.span_with_errors_ok_status",
+            sample_rate=0.01,
+            tags=mock.ANY,
+        )
+
     def test_with_performance_issues(self) -> None:
         self.load_trace()
         with self.feature(self.FEATURES):